This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
%load_ext autoreload
%autoreload 2
from bhishan.util_ds import get_column_descriptions
from bhishan.util_viz import count_plot
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import time
# random state
# Fixed seed and a shared RandomState built from it (not referenced in the
# cells visible here).
SEED = 0
RNG = np.random.RandomState(SEED)
# Jupyter notebook settings for pandas
pd.set_option('display.max_columns', 50)
# Thousands separator plus the 'g' format with two significant digits, so
# floats are shown heavily rounded or in scientific form (e.g. 2.2e+05).
pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
pd.set_option('display.max_rows', 50) # None for all the rows
pd.set_option('display.max_colwidth', 50)
import IPython
from IPython.display import display
# Print the name and version of each main library in use.
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
[('numpy', '1.16.4'), ('pandas', '0.25.0'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]
import scipy
from scipy import stats # pointbiserialr
from scipy import linalg
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
# Load the cleaned and encoded data set, print its shape, and preview the
# first five rows transposed so that every column appears as a row.
df = pd.read_csv('../data/processed/data_cleaned_encoded.csv')
print(df.shape)
df.head().T
(21613, 92)
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
id | 7129300520 | 6414100192 | 5631500400 | 2487200875 | 1954400510 |
date | 2014-10-13 | 2014-12-09 | 2015-02-25 | 2014-12-09 | 2015-02-18 |
price | 2.2e+05 | 5.4e+05 | 1.8e+05 | 6e+05 | 5.1e+05 |
bedrooms | 3 | 3 | 2 | 4 | 3 |
bathrooms | 1 | 2.2 | 1 | 3 | 2 |
sqft_living | 1180 | 2570 | 770 | 1960 | 1680 |
sqft_lot | 5650 | 7242 | 10000 | 5000 | 8080 |
floors | 1 | 2 | 1 | 1 | 1 |
waterfront | 0 | 0 | 0 | 0 | 0 |
view | 0 | 0 | 0 | 0 | 0 |
condition | 3 | 3 | 3 | 5 | 3 |
grade | 7 | 7 | 6 | 7 | 8 |
sqft_above | 1180 | 2170 | 770 | 1050 | 1680 |
sqft_basement | 0 | 400 | 0 | 910 | 0 |
yr_built | 1955 | 1951 | 1933 | 1965 | 1987 |
yr_renovated | 0 | 1991 | 0 | 0 | 0 |
zipcode | 98178 | 98125 | 98028 | 98136 | 98074 |
lat | 48 | 48 | 48 | 48 | 48 |
long | -1.2e+02 | -1.2e+02 | -1.2e+02 | -1.2e+02 | -1.2e+02 |
sqft_living15 | 1340 | 1690 | 2720 | 1360 | 1800 |
sqft_lot15 | 5650 | 7639 | 8062 | 5000 | 7503 |
yr_sales | 2014 | 2014 | 2015 | 2014 | 2015 |
age | 59 | 63 | 82 | 49 | 28 |
yr_renovated2 | 1955 | 1991 | 1933 | 1965 | 1987 |
age_after_renovation | 59 | 23 | 82 | 49 | 28 |
... | ... | ... | ... | ... | ... |
age_cat_2 | 0 | 0 | 0 | 0 | 1 |
age_cat_3 | 0 | 0 | 0 | 0 | 0 |
age_cat_4 | 0 | 0 | 0 | 1 | 0 |
age_cat_5 | 1 | 1 | 0 | 0 | 0 |
age_cat_6 | 0 | 0 | 0 | 0 | 0 |
age_cat_7 | 0 | 0 | 1 | 0 | 0 |
age_cat_8 | 0 | 0 | 0 | 0 | 0 |
age_cat_9 | 0 | 0 | 0 | 0 | 0 |
age_after_renovation_cat_0 | 0 | 0 | 0 | 0 | 0 |
age_after_renovation_cat_1 | 0 | 0 | 0 | 0 | 0 |
age_after_renovation_cat_2 | 0 | 1 | 0 | 0 | 1 |
age_after_renovation_cat_3 | 0 | 0 | 0 | 0 | 0 |
age_after_renovation_cat_4 | 0 | 0 | 0 | 1 | 0 |
age_after_renovation_cat_5 | 1 | 0 | 0 | 0 | 0 |
age_after_renovation_cat_6 | 0 | 0 | 0 | 0 | 0 |
age_after_renovation_cat_7 | 0 | 0 | 1 | 0 | 0 |
age_after_renovation_cat_8 | 0 | 0 | 0 | 0 | 0 |
age_after_renovation_cat_9 | 0 | 0 | 0 | 0 | 0 |
log1p_price | 12 | 13 | 12 | 13 | 13 |
log1p_sqft_living | 7.1 | 7.9 | 6.6 | 7.6 | 7.4 |
log1p_sqft_lot | 8.6 | 8.9 | 9.2 | 8.5 | 9 |
log1p_sqft_above | 7.1 | 7.7 | 6.6 | 7 | 7.4 |
log1p_sqft_basement | 0 | 6 | 0 | 6.8 | 0 |
log1p_sqft_living15 | 7.2 | 7.4 | 7.9 | 7.2 | 7.5 |
log1p_sqft_lot15 | 8.6 | 8.9 | 9 | 8.5 | 8.9 |
92 rows × 5 columns
# Pearson correlations among the ten columns most correlated with price
# (price itself is one of them, with correlation 1).
# Only numeric/boolean columns are handed to corr(): the string-valued 'date'
# column cannot be correlated, and recent pandas releases raise on such
# columns instead of silently dropping them.
df_corr = df.select_dtypes(include=['number', 'bool']).corr(method='pearson')
cols10 = df_corr.nlargest(10, 'price').index
df_corr = df[cols10].corr()
# axis=None applies one color scale across the whole table.
df_corr.style.background_gradient(cmap='coolwarm', axis=None)
price | log1p_price | sqft_living | grade | log1p_sqft_living | sqft_above | sqft_living15 | log1p_sqft_living15 | log1p_sqft_above | bathrooms | |
---|---|---|---|---|---|---|---|---|---|---|
price | 1 | 0.891654 | 0.702035 | 0.667434 | 0.611757 | 0.605567 | 0.585379 | 0.544014 | 0.542774 | 0.525138 |
log1p_price | 0.891654 | 1 | 0.695341 | 0.703634 | 0.67494 | 0.601802 | 0.619312 | 0.607201 | 0.586322 | 0.550802 |
sqft_living | 0.702035 | 0.695341 | 1 | 0.762704 | 0.954368 | 0.876597 | 0.75642 | 0.732194 | 0.84324 | 0.754665 |
grade | 0.667434 | 0.703634 | 0.762704 | 1 | 0.743711 | 0.755923 | 0.713202 | 0.688419 | 0.743416 | 0.664983 |
log1p_sqft_living | 0.611757 | 0.67494 | 0.954368 | 0.743711 | 1 | 0.832336 | 0.736567 | 0.746137 | 0.865382 | 0.761316 |
sqft_above | 0.605567 | 0.601802 | 0.876597 | 0.755923 | 0.832336 | 1 | 0.73187 | 0.701817 | 0.962353 | 0.685342 |
sqft_living15 | 0.585379 | 0.619312 | 0.75642 | 0.713202 | 0.736567 | 0.73187 | 1 | 0.976821 | 0.714572 | 0.568634 |
log1p_sqft_living15 | 0.544014 | 0.607201 | 0.732194 | 0.688419 | 0.746137 | 0.701817 | 0.976821 | 1 | 0.712634 | 0.570834 |
log1p_sqft_above | 0.542774 | 0.586322 | 0.84324 | 0.743416 | 0.865382 | 0.962353 | 0.714572 | 0.712634 | 1 | 0.694954 |
bathrooms | 0.525138 | 0.550802 | 0.754665 | 0.664983 | 0.761316 | 0.685342 | 0.568634 | 0.570834 | 0.694954 | 1 |
# Annotated heatmap of the top-10 correlation matrix, saved to the reports
# folder. Only the cells below the diagonal are drawn.
plt.figure(figsize=(12,8))
plt.subplots_adjust(bottom=0.01)
# Boolean mask that is True on and above the diagonal; heatmap hides the
# cells where the mask is True.
mask = np.triu(np.ones_like(df_corr, dtype=bool))
sns.heatmap(df_corr, cbar=True, annot=True, fmt='.2f',mask=mask)
plt.tight_layout()
plt.savefig('../reports/figures/correlation_matrix.png',dpi=300)
# Histograms (25 bins each) of every raw feature, the price included, drawn
# on one large figure and saved to the reports folder.
features_raw_all = [
    'price', 'bedrooms', 'bathrooms', 'sqft_living',
    'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]
df1 = df[features_raw_all]
h = df1.hist(
    bins=25,
    figsize=(24, 24),
    xlabelsize='20',
    ylabelsize='20',
    xrot=-20,
)
sns.despine(left=True, bottom=True)
# Enlarge every panel title and keep the y ticks on the left-hand side.
for hist_ax in h.ravel():
    hist_ax.title.set_size(24)
    hist_ax.yaxis.tick_left()
plt.tight_layout()
plt.savefig('../reports/figures/histograms_of_all_features.png', dpi=300)
# List the columns whose name contains 'yr' or 'price'.
df.filter(regex='yr|price').columns
Index(['price', 'yr_built', 'yr_renovated', 'yr_sales', 'yr_renovated2', 'log1p_price'], dtype='object')
# Median price for each sale year, drawn as a line with point markers.
df.groupby('yr_sales').agg({'price': 'median'}).plot(marker='o')
<matplotlib.axes._subplots.AxesSubplot at 0x1209a15f8>
# Median price for each construction year, drawn as a line with point markers.
df.groupby('yr_built').agg({'price': 'median'}).plot(marker='o')
<matplotlib.axes._subplots.AxesSubplot at 0x120e65d30>
# Price histogram restricted to the rows whose yr_built is 2014.
df[df['yr_built']==2014]['price'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x123a3e668>
# Scatter of price against sale year, points drawn in red.
df.plot.scatter(x='yr_sales',y='price',c='r')
<matplotlib.axes._subplots.AxesSubplot at 0x1252352e8>
Features whose values can be counted (1, 2, 3, ...) are called discrete variables. For example, the number of bedrooms or the number of floors.
A simple check for a discrete variable is to look at the number of unique values and at whether the feature has an integer dtype. Sometimes a feature has a float dtype but is still a discrete variable.
# Names of the raw feature columns (price included); the same list is
# defined earlier in this notebook for the histograms.
features_raw_all = ['price', 'bedrooms', 'bathrooms', 'sqft_living',
'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
'lat', 'long', 'sqft_living15', 'sqft_lot15']
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age', 'yr_renovated2', 'age_after_renovation', 'zipcode_top10', 'zipcode_houses', 'basement_bool', 'renovation_bool', 'age_cat', 'age_after_renovation_cat', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9', 'zipcode_top10_98004', 'zipcode_top10_98006', 'zipcode_top10_98033', 'zipcode_top10_98039', 'zipcode_top10_98040', 'zipcode_top10_98102', 'zipcode_top10_98105', 'zipcode_top10_98155', 'zipcode_top10_98177', 'zipcode_top10_others', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_4', 'age_after_renovation_cat_5', 'age_after_renovation_cat_6', 'age_after_renovation_cat_7', 'age_after_renovation_cat_8', 'age_after_renovation_cat_9', 'log1p_price', 'log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15'], dtype='object')
# Numeric columns: every raw feature whose dtype is not object.
cols_num = [name for name in features_raw_all if df[name].dtype != 'O']
# Year-like columns, recognized by 'yr' or 'year' in the name.
cols_time = [
    name for name in cols_num
    if any(tag in name for tag in ('yr', 'year'))
]
# Discrete columns: numeric, fewer than 20 distinct values (NaN counted as a
# value), and neither a year-like column nor 'id'.
cols_discrete = [
    name for name in cols_num
    if name not in cols_time + ['id']
    and df[name].nunique(dropna=False) < 20
]
print('Number of discrete variables: ', len(cols_discrete))
df[cols_discrete].head()
Number of discrete variables: 6
bedrooms | floors | waterfront | view | condition | grade | |
---|---|---|---|---|---|---|
0 | 3 | 1 | 0 | 0 | 3 | 7 |
1 | 3 | 2 | 0 | 0 | 3 | 7 |
2 | 2 | 1 | 0 | 0 | 3 | 6 |
3 | 4 | 1 | 0 | 0 | 5 | 7 |
4 | 3 | 1 | 0 | 0 | 3 | 8 |
# One count plot per discrete variable, with the bars ordered from the most
# to the least frequent value.
for c in cols_discrete:
    # Pass the vector by keyword: newer seaborn releases take `data` as the
    # first positional argument, so a positional Series is no longer read as x.
    sns.countplot(x=df[c], order=df[c].value_counts().index)
    plt.show()

# Project helper for count plots; presumably it writes the figure to `ofile`
# -- verify against bhishan.util_viz.
from bhishan.util_viz import count_plot
count_plot(df,'bedrooms',bottom=200, ofile='../reports/figures/bedrooms_counts.png')
count_plot(df,'grade',bottom=200,ofile='../reports/figures/grade_counts.png')
# Summary statistics of the sqft_living column.
df['sqft_living'].describe()
count 2.2e+04 mean 2.1e+03 std 9.2e+02 min 2.9e+02 25% 1.4e+03 50% 1.9e+03 75% 2.6e+03 max 1.4e+04 Name: sqft_living, dtype: float64
# Coarse five-bin histogram of sqft_living; the column is looked up by name
# through the data= argument.
plt.hist('sqft_living', data = df, bins = 5);
# Show every column name available in df.
df.columns
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age', 'yr_renovated2', 'age_after_renovation', 'zipcode_top10', 'zipcode_houses', 'basement_bool', 'renovation_bool', 'age_cat', 'age_after_renovation_cat', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9', 'zipcode_top10_98004', 'zipcode_top10_98006', 'zipcode_top10_98033', 'zipcode_top10_98039', 'zipcode_top10_98040', 'zipcode_top10_98102', 'zipcode_top10_98105', 'zipcode_top10_98155', 'zipcode_top10_98177', 'zipcode_top10_others', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_4', 'age_after_renovation_cat_5', 'age_after_renovation_cat_6', 'age_after_renovation_cat_7', 'age_after_renovation_cat_8', 'age_after_renovation_cat_9', 'log1p_price', 'log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15'], dtype='object')
# Columns that have a 'log1p_<name>' counterpart in df.
cols_logs = [
    'price', 'sqft_living', 'sqft_lot',
    'sqft_above', 'sqft_basement', 'sqft_living15',
    'sqft_lot15',
]
colors10_hex = [
    '#b03060', '#ff0000', '#ff00ff',
    '#67ceab', '#63c56c', '#225e31',
    '#29b6f6', '#6495ed', '#00008b',
    '#ffa500',
]
# One figure per column: raw distribution in the left panel, the log1p
# version in the right panel, both in the same color.
for i, c in enumerate(cols_logs):
    fig = plt.figure(figsize=(20, 5))
    for position, column in ((1, c), (2, 'log1p_' + c)):
        plt.subplot(1, 2, position)
        sns.distplot(
            df[column],
            hist=True, kde=True, rug=False, norm_hist=True,
            kde_kws={"label": column},
            color=colors10_hex[i],
        )
    plt.show()
# Overlay the distributions of sqft_living and sqft_living15 on one set of
# axes and save the figure to the reports folder.
plt.figure(figsize=(12,8))
for column in ('sqft_living', 'sqft_living15'):
    # The legend label is the column name itself (the previous labels were
    # misspelled "sqrt_...").
    sns.distplot(df[column], hist=True, kde=True, rug=False,
                 norm_hist=True, kde_kws={"label": column})
plt.tight_layout()
plt.savefig('../reports/figures/sqft_living_distplot.png')
Variables such as district names, country names, class names, etc. are categorical variables. Sometimes discrete variables such as the number of bedrooms or the number of floors can be treated as categorical variables.
When a class of a categorical variable makes up less than 1% of the data, it may cause overfitting; we may want to relabel such classes as RARE, or drop those rows.
# Number of rows for each value of the waterfront column.
df['waterfront'].value_counts()
0 21450 1 163 Name: waterfront, dtype: int64
# Left panel: number of rows per waterfront value (horizontal bars).
# Right panel: overlaid distributions of sqft_living and sqft_living15.
fig = plt.figure(figsize=(20,5))
plt.subplot(1, 2, 1)
ax0 = sns.countplot(y="waterfront", data=df)

plt.subplot(1, 2, 2)
sns.distplot(df['sqft_living'], hist=True, kde=True, rug=False,
             norm_hist=True, kde_kws={"label": "sqft_living"})
ax1 = sns.distplot(df['sqft_living15'], hist=True, kde=True, rug=False,
                   norm_hist=True, kde_kws={"label": "sqft_living15"})

ax0.tick_params(axis='both', which='both', labelsize=14)
ax1.tick_params(axis='both', which='both', labelsize=14)

ax0.set_ylabel('waterfront',fontsize=14)
ax0.set_xlabel('count',fontsize=14)

ax1.set_ylabel('')
# The right panel's x axis carries living-area values, not counts.
ax1.set_xlabel('sqft',fontsize=14)
plt.show()
# Horizontal box plots of price for each waterfront value (means marked),
# saved to the reports folder.
plt.figure(figsize=(12, 8))
sns.boxplot(x='price', y='waterfront', data=df, orient='h',
            width=0.8, fliersize=3, showmeans=True)
plt.xlabel('price', fontsize=16)
plt.ylabel('waterfront', fontsize=16)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.savefig('../reports/figures/waterfront_boxplots.png')

# Point-biserial correlation between the waterfront flag and price.
r, p = stats.pointbiserialr(df['waterfront'], df['price'])
message = 'point biserial correlation r is {:.2f} with p = {:.2f}'
print (message.format(r,p))
point biserial correlation r is 0.27 with p = 0.00
Observation
We have not tested the assumptions of point-biserial correlation
# Horizontal box plots of price for each basement_bool value (means marked),
# saved to the reports folder.
plt.figure(figsize=(12, 8))
sns.boxplot(x='price', y='basement_bool', data=df, orient='h',
            width=0.8, fliersize=3, showmeans=True)
plt.xlabel('price', fontsize=16)
plt.ylabel('basement', fontsize=16)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.savefig('../reports/figures/basement_boxplots.png')

# Point-biserial correlation between the basement flag and price.
r, p = stats.pointbiserialr(df['basement_bool'], df['price'])
message = 'point biserial correlation r is {:.2f} with p = {:.2f}'
print (message.format(r,p))
point biserial correlation r is 0.18 with p = 0.00
# Horizontal box plots of price for each renovation_bool value (means
# marked), saved to the reports folder.
plt.figure(figsize=(12, 8))
sns.boxplot(x='price', y='renovation_bool', data=df, orient='h',
            width=0.8, fliersize=3, showmeans=True)
plt.xlabel('price', fontsize=16)
plt.ylabel('renovation', fontsize=16)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.savefig('../reports/figures/renovation_boxplots.png')

# Point-biserial correlation between the renovation flag and price.
r, p = stats.pointbiserialr(df['renovation_bool'], df['price'])
message = 'point biserial correlation r is {:.2f} with p = {:.2f}'
print (message.format(r,p))
point biserial correlation r is 0.13 with p = 0.00
# NOTE(review): cols_cat is not used by the plotting code below, which draws
# 'waterfront' where this list names 'floors' -- confirm which is intended.
cols_cat = ['bedrooms', 'bathrooms',
            'floors', 'view',
            'condition', 'grade']

# 3x2 grid of price box plots, one panel per feature.
f, ax = plt.subplots(3, 2, figsize=(14, 14))

# Grid position -> (column on the x axis, x-axis label).
panels = {
    (0, 0): ('bedrooms', 'Bedrooms'),
    (0, 1): ('bathrooms', 'Bathrooms'),
    (1, 0): ('waterfront', 'Waterfront'),
    (1, 1): ('view', 'View'),
    (2, 0): ('condition', 'Condition'),
    (2, 1): ('grade', 'Grade'),
}

for (row, col), (feature, _) in panels.items():
    sns.boxplot(x=df[feature], y=df['price'], ax=ax[row][col])
sns.despine(left=True, bottom=True)

for (row, col), (feature, title) in panels.items():
    panel = ax[row][col]
    on_right = (col == 1)
    # Right-hand column: y label and y ticks go on the right edge.
    if on_right:
        panel.yaxis.set_label_position("right")
        panel.yaxis.tick_right()
    # The View and Grade panels keep the default label font size.
    if title in ('View', 'Grade'):
        panel.set(xlabel=title, ylabel='Price')
    else:
        panel.set_xlabel(title, fontsize=14)
        panel.set_ylabel('Price', fontsize=14)
    panel.tick_params(axis='x', labelsize=12)
    panel.tick_params(axis='y', labelsize=12)
    # Left-hand column: y ticks on the left edge.
    if not on_right:
        panel.yaxis.tick_left()
    # Only the bathrooms panel gets vertical x tick labels.
    if feature == 'bathrooms':
        panel.set_xticklabels(panel.get_xticklabels(), rotation=90)

plt.tight_layout()
plt.savefig('../reports/figures/categorical_features_boxplots.png')
df.columns
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_sales', 'age', 'yr_renovated2', 'age_after_renovation', 'zipcode_top10', 'zipcode_houses', 'basement_bool', 'renovation_bool', 'age_cat', 'age_after_renovation_cat', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5', 'grade_1', 'grade_10', 'grade_11', 'grade_12', 'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9', 'zipcode_top10_98004', 'zipcode_top10_98006', 'zipcode_top10_98033', 'zipcode_top10_98039', 'zipcode_top10_98040', 'zipcode_top10_98102', 'zipcode_top10_98105', 'zipcode_top10_98155', 'zipcode_top10_98177', 'zipcode_top10_others', 'age_cat_0', 'age_cat_1', 'age_cat_2', 'age_cat_3', 'age_cat_4', 'age_cat_5', 'age_cat_6', 'age_cat_7', 'age_cat_8', 'age_cat_9', 'age_after_renovation_cat_0', 'age_after_renovation_cat_1', 'age_after_renovation_cat_2', 'age_after_renovation_cat_3', 'age_after_renovation_cat_4', 'age_after_renovation_cat_5', 'age_after_renovation_cat_6', 'age_after_renovation_cat_7', 'age_after_renovation_cat_8', 'age_after_renovation_cat_9', 'log1p_price', 'log1p_sqft_living', 'log1p_sqft_lot', 'log1p_sqft_above', 'log1p_sqft_basement', 'log1p_sqft_living15', 'log1p_sqft_lot15'], dtype='object')
# Spearman rank correlation (and p-value) between bedrooms and price.
stats.spearmanr(df['bedrooms'],df['price'])
SpearmanrResult(correlation=0.34465237095978885, pvalue=0.0)
# Spearman rank correlation (and p-value) between floors and price.
stats.spearmanr(df['floors'],df['price'])
SpearmanrResult(correlation=0.32234655003563695, pvalue=0.0)
# Spearman correlation of each listed column with price, reshaped into a
# two-column table ('column', 'spearmanr').
(df[['bedrooms','floors','view','condition','grade']]
.corrwith(df['price'],method='spearman')
.rename('spearmanr').rename_axis('column').reset_index())
column | spearmanr | |
---|---|---|
0 | bedrooms | 0.34 |
1 | floors | 0.32 |
2 | view | 0.29 |
3 | condition | 0.018 |
4 | grade | 0.66 |
colors = [
    '#006400', '#00008b', '#b03060',
    '#ff0000', '#ffff00', '#00ff00',
    '#00ffff', '#ff00ff', '#6495ed', '#ffdead',
]
fig, ax = plt.subplots(4, 1, figsize=(12, 18))
# One horizontal bar chart per feature: mean price of each level, bars sorted
# by that mean, each chart in its own color.
for idx, group_col in enumerate(['bedrooms', 'floors', 'view', 'grade']):
    mean_price = df.groupby([group_col])['price'].mean().sort_values()
    mean_price.plot.barh(color=colors[idx], ax=ax[idx])
plt.savefig('../reports/figures/categorical_variables_barh_plots.png', dpi=300)
# Regression joint plot of price against sqft_living, saved to the reports
# folder.
sns.jointplot(data=df, x="sqft_living", y="price", kind='reg', height=7)
plt.xlabel('sqft_living', fontsize=16)
plt.ylabel('price', fontsize=16)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)
plt.savefig(
    '../reports/figures/sqft_living_vs_price_jointplot.png',
    dpi=300,
)
from bhishan.util_viz import colors
def multiple_jointplots_with_pearsonr(cols, target, data=None):
    """Draw one regression joint plot per column, annotated with Pearson r.

    Parameters
    ----------
    cols : iterable of str
        Column names plotted on the x axis; one figure is created per column.
    target : str
        Column name plotted on the y axis of every figure.
    data : DataFrame, optional
        Frame holding the columns. Defaults to the notebook-level ``df``.

    Returns
    -------
    module
        ``matplotlib.pyplot``, returned after all figures have been created.

    Notes
    -----
    Plot colors come from the notebook-level ``colors`` sequence and are
    reused cyclically when there are more columns than colors.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not by
    # itself guarantee that ``scipy.stats`` has been loaded.
    from scipy import stats as scipy_stats

    frame = df if data is None else data
    for i, col in enumerate(cols):
        grid = sns.jointplot(x=col, y=target, data=frame, kind='reg',
                             height=5, color=colors[i % len(colors)])
        r, _ = scipy_stats.pearsonr(frame[col].values, frame[target].values)
        # Figure-relative coordinates: the text lands in the upper-left area.
        grid.fig.text(0.3, 0.7, "pearsonr = {:.2f}".format(r), ha='left',
                      fontsize=15)
    return plt
# Joint plots of price against three living-area features.
cols = ['sqft_living', 'sqft_living15', 'sqft_above']
multiple_jointplots_with_pearsonr(cols,'price')
# NOTE(review): tight_layout/savefig act on the current figure only, so
# presumably just the last joint plot ends up in the file -- confirm this is
# intended.
plt.tight_layout()
plt.savefig('../reports/figures/multiple_jointplots_with_pearsonr.png',dpi=300)
Observation:
# Project helper; presumably draws pairwise plots annotated with Pearson r
# and writes them to `ofile` -- verify against bhishan.util_viz.
from bhishan.util_viz import corrplot_with_pearsonr
cols = ['sqft_living', 'sqft_living15', 'sqft_above']
corrplot_with_pearsonr(df,cols,ofile='../reports/figures/corrplot_with_pearsonr.png')
Observation
# Project helper; presumably returns the partial correlation of each pair of
# columns while controlling for the remaining one -- verify against
# bhishan.util_stats.
from bhishan.util_stats import partial_corr
cols = ['price', 'sqft_living', 'sqft_living15']
partial_corr(df,cols)
price | sqft_living | sqft_living15 | |
---|---|---|---|
price | 1 | 0.48 | 0.063 |
sqft_living | 0.48 | 1 | 0.78 |
sqft_living15 | 0.063 | 0.78 | 1 |
# Plain pairwise Pearson correlations of the same columns, for comparison.
df[cols].corr(method='pearson') # full correlations.
price | sqft_living | sqft_living15 | |
---|---|---|---|
price | 1 | 0.7 | 0.59 |
sqft_living | 0.7 | 1 | 0.76 |
sqft_living15 | 0.59 | 0.76 | 1 |
# Pearson r and p-value between sqft_living and price.
stats.pearsonr(df['sqft_living'], df['price'])
(0.7020350546118002, 0.0)
# Pearson r and p-value between sqft_living15 and price.
stats.pearsonr(df['sqft_living15'], df['price'])
(0.585378903579568, 0.0)
Observation
from mpl_toolkits.mplot3d import Axes3D
# 2x2 grid of 3D scatter plots relating room counts, floors and areas.
fig = plt.figure(figsize=(19, 12.5))

# One entry per panel:
# (x column, y column, z column, x label, y label, z label, y-axis limits).
# The z label of the bathrooms panels names the plotted column; it used to
# read "Bathrooms / Bedrooms" although only df['bathrooms'] is drawn.
panels_3d = [
    ('floors', 'bedrooms', 'bathrooms',
     '\nFloors', '\nBedrooms', '\nBathrooms', [0, 12]),
    ('floors', 'bedrooms', 'sqft_living',
     '\nFloors', '\nBedrooms', '\nsqft Living', [0, 12]),
    ('sqft_living', 'sqft_lot', 'bathrooms',
     '\n sqft Living', '\nsqft Lot', '\nBathrooms', [0, 250000]),
    ('sqft_living', 'sqft_lot', 'bedrooms',
     '\n sqft Living', '\nsqft Lot', 'Bedrooms', [0, 250000]),
]

for position, panel_spec in enumerate(panels_3d, start=1):
    x_col, y_col, z_col, x_label, y_label, z_label, y_limits = panel_spec
    ax = fig.add_subplot(2, 2, position, projection="3d")
    ax.scatter(df[x_col], df[y_col], df[z_col], c="darkgreen", alpha=.5)
    ax.set(xlabel=x_label, ylabel=y_label, zlabel=z_label)
    # Restrict the y axis to the given range.
    ax.set(ylim=y_limits)

plt.tight_layout()
plt.show()
# Single 3D scatter: view (x) against grade (y) and construction year (z).
fig=plt.figure(figsize=(9.5,6.25))
ax=fig.add_subplot(1,1,1, projection="3d")
ax.scatter(df['view'],df['grade'],df['yr_built'],c="darkgreen",alpha=.5)
ax.set(xlabel='\nView',ylabel='\nGrade',zlabel='\nYear Built');
# 2D alternative: view against grade, with points colored by yr_built.
sns.scatterplot(x='view',y='grade',data=df,hue='yr_built')
<matplotlib.axes._subplots.AxesSubplot at 0x11d8829e8>