This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import time
# Reproducibility: a fixed seed and a RandomState generator built from it.
SEED = 0
RNG = np.random.RandomState(SEED)

# Pandas display options for the notebook, applied in order.
# NOTE: the '.2g' float format keeps two significant digits, so values of
# 100 or more are rendered in scientific notation (e.g. 1.2e+03).
for _option, _value in (
    ('display.max_columns', 50),
    ('display.float_format', '{:,.2g}'.format),
    ('display.max_rows', 20),  # None shows every row
    ('display.max_colwidth', 50),
):
    pd.set_option(_option, _value)
import IPython
from IPython.display import display
# Record the versions of the core libraries this notebook was run with.
print([(lib.__name__, lib.__version__) for lib in (np, pd, sns, matplotlib)])
[('numpy', '1.16.4'), ('pandas', '0.25.0'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tls
from plotly.offline import plot, iplot, init_notebook_mode
# Enable plotly's offline notebook rendering, then show the plotly version
# as the cell result.
init_notebook_mode(connected=False)
[(plotly.__name__, plotly.__version__)]
[('plotly', '3.10.0')]
%load_ext autoreload
%autoreload 2
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
# Load the cleaned and encoded dataset, report its (rows, columns) shape,
# and preview the first five records transposed (one column per record).
df = pd.read_csv(
    filepath_or_buffer='../data/processed/data_cleaned_encoded.csv',
)
print(df.shape)
df.head(n=5).transpose()
(21613, 92)
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
id | 7129300520 | 6414100192 | 5631500400 | 2487200875 | 1954400510 |
date | 2014-10-13 | 2014-12-09 | 2015-02-25 | 2014-12-09 | 2015-02-18 |
price | 2.2e+05 | 5.4e+05 | 1.8e+05 | 6e+05 | 5.1e+05 |
bedrooms | 3 | 3 | 2 | 4 | 3 |
bathrooms | 1 | 2.2 | 1 | 3 | 2 |
sqft_living | 1180 | 2570 | 770 | 1960 | 1680 |
sqft_lot | 5650 | 7242 | 10000 | 5000 | 8080 |
floors | 1 | 2 | 1 | 1 | 1 |
waterfront | 0 | 0 | 0 | 0 | 0 |
view | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... |
age_after_renovation_cat_7 | 0 | 0 | 1 | 0 | 0 |
age_after_renovation_cat_8 | 0 | 0 | 0 | 0 | 0 |
age_after_renovation_cat_9 | 0 | 0 | 0 | 0 | 0 |
log1p_price | 12 | 13 | 12 | 13 | 13 |
log1p_sqft_living | 7.1 | 7.9 | 6.6 | 7.6 | 7.4 |
log1p_sqft_lot | 8.6 | 8.9 | 9.2 | 8.5 | 9 |
log1p_sqft_above | 7.1 | 7.7 | 6.6 | 7 | 7.4 |
log1p_sqft_basement | 0 | 6 | 0 | 6.8 | 0 |
log1p_sqft_living15 | 7.2 | 7.4 | 7.9 | 7.2 | 7.5 |
log1p_sqft_lot15 | 8.6 | 8.9 | 9 | 8.5 | 8.9 |
92 rows × 5 columns
# Correlation heatmap via the project's plotly helper, called with the
# 'price' column; the figure is written to the HTML report in `ofile`.
from bhishan.util_viz_plotly import plotly_corr_heatmap

plotly_corr_heatmap(
    df,
    'price',
    ofile='../reports/html/correlation_heatmap.html',
    show=False,
    auto_open=True,
)
# Count plot of the 'bedrooms' column via the project's plotly helper
# (topN=5 presumably keeps the five most frequent values -- confirm in the
# helper).
from bhishan.util_viz_plotly import plotly_countplot

# Write to a file of its own: the 'grade' count plot in this notebook also
# targets countplot_grade.html, so sharing that name would make the two
# reports overwrite each other.
plotly_countplot(
    df,
    'bedrooms',
    topN=5,
    show=False,
    ofile='../reports/html/countplot_bedrooms.html',
    auto_open=True,
)
# Count plot of the 'grade' column via the project's plotly helper, drawn
# in 'plum' and saved as an HTML report.
from bhishan.util_viz_plotly import plotly_countplot

plotly_countplot(
    df,
    'grade',
    topN=5,
    color='plum',
    ofile='../reports/html/countplot_grade.html',
    show=False,
    auto_open=True,
)
# Histogram of 'sqft_living' via the project's plotly helper, saved as an
# HTML report.
from bhishan.util_viz_plotly import plotly_histogram

plotly_histogram(
    df,
    'sqft_living',
    ofile='../reports/html/histogram_sqft_living.html',
    show=False,
    auto_open=True,
)
# Histogram of 'yr_built' via the project's plotly helper (size=1 is
# presumably the bin width, i.e. one bin per year -- confirm in the helper).
from bhishan.util_viz_plotly import plotly_histogram

plotly_histogram(
    df,
    'yr_built',
    size=1,
    color='orchid',
    ofile='../reports/html/histogram_yr_built.html',
    show=False,
    auto_open=True,
)
# Normalised histogram of 'sqft_living' with a KDE curve overlaid (no rug).
# The KDE legend label matches the column name; it was previously
# misspelled "sqrt_living".
sns.distplot(
    df['sqft_living'],
    hist=True,
    kde=True,
    rug=False,
    norm_hist=True,
    kde_kws={"label": "sqft_living"},
)
<matplotlib.axes._subplots.AxesSubplot at 0x11c3576d8>
# Distribution plot of 'sqft_living' via the project's plotly helper, saved
# as an HTML report.
from bhishan.util_viz_plotly import plotly_distplot

cols = 'sqft_living'
plotly_distplot(
    df,
    cols,
    ofile='../reports/html/distplot_sqft_living.html',
    show=False,
    auto_open=True,
)
# Vertical box plot of the sale price.
sns.boxplot(data=df, y='price')
<matplotlib.axes._subplots.AxesSubplot at 0x120b56438>
# Box plot of 'price' via the project's plotly helper, saved as an HTML
# report.
from bhishan.util_viz_plotly import plotly_boxplot

col = 'price'
plotly_boxplot(
    df,
    col,
    ofile='../reports/html/boxplot_price.html',
    show=False,
    auto_open=True,
)
# Price distribution per bedroom count, one box per 'bedrooms' value.
sns.boxplot(data=df, x='bedrooms', y='price')
<matplotlib.axes._subplots.AxesSubplot at 0x12059c2b0>
# 'price' box plots split by 'bedrooms' via the project's plotly helper,
# saved as an HTML report.
from bhishan.util_viz_plotly import plotly_boxplot_categorical_column

plotly_boxplot_categorical_column(
    df,
    'bedrooms',
    'price',
    ofile='../reports/html/boxplot_bedrooms_vs_price.html',
    show=False,
    auto_open=True,
)
# Box plots for several columns at once via the project's plotly helper
# (ylim_lst=[0, 13] is presumably the y-axis range -- confirm in the helper).
from bhishan.util_viz_plotly import plotly_boxplot

plotly_boxplot(
    df,
    ['bedrooms', 'bathrooms', 'floors'],
    ylim_lst=[0, 13],
    ofile='../reports/html/boxplot_bedrooms_bathrooms_floors.html',
    show=False,
    auto_open=True,
)
# Scatter plot of 'sqft_living15' against 'price' via the project's plotly
# helper, coloured by 'grade' on the 'Reds' scale, with logy and bestfit
# enabled; saved as an HTML report.
from bhishan.util_viz_plotly import plotly_scattergl_plot

xcol, ycol = 'sqft_living15', 'price'
color, colorscale = 'grade', 'Reds'
plotly_scattergl_plot(
    df,
    xcol,
    ycol,
    logy=True,
    bestfit=True,
    color=color,
    colorscale=colorscale,
    ofile='../reports/html/sqft_living15_vs_price_wrt_grade.html',
    show=False,
    auto_open=True,
)
import seaborn as sns

# Linear fit of log(price) against 'sqft_living15'.
# x and y are passed by keyword: later seaborn releases no longer accept
# them positionally, and the keyword form works on older releases too.
sns.regplot(x=df['sqft_living15'], y=np.log(df['price']))
<matplotlib.axes._subplots.AxesSubplot at 0x1209feb70>
# Scatter plot of 'sqft_living15' against 'price' via the project's plotly
# helper, with colorcol='bedrooms' and logy enabled; saved as an HTML report.
from bhishan.util_viz_plotly import plotly_scattergl_plot_colorcol

plotly_scattergl_plot_colorcol(
    df,
    'sqft_living15',
    'price',
    colorcol='bedrooms',
    logy=True,
    ofile='../reports/html/sqft_living15_vs_price.html',
    show=False,
    auto_open=True,
)
# Scatter subplots of 'sqft_living15' against 'price' via the project's
# plotly helper, driven by the list of categorical columns below; saved as
# an HTML report.
from bhishan.util_viz_plotly import plotly_scattergl_plot_subplots

subplot_cols = [
    'bedrooms',
    'bathrooms',
    'condition',
    'grade',
    'waterfront',
]
plotly_scattergl_plot_subplots(
    df,
    'sqft_living15',
    'price',
    subplot_cols,
    logy=True,
    ofile='../reports/html/categorical_subplots.html',
    show=False,
)
This is the format of your plot grid: [ (1,1) x1,y1 ] [ (2,1) x2,y2 ] [ (3,1) x3,y3 ] [ (4,1) x4,y4 ] [ (5,1) x5,y5 ]
# Bubble plot for homes built in 2014 via the project's plotly helper,
# using the 'grade', 'bedrooms', 'bathrooms' and 'floors' columns.
from bhishan.util_viz_plotly import plotly_bubbleplot

df1 = df.loc[df['yr_built'] == 2014]
plotly_bubbleplot(
    df1,
    'grade',
    'bedrooms',
    'bathrooms',
    'floors',
    size_factor=6,
    ofile='../reports/html/bubbleplot1.html',
    show=False,
)
# Bubble plot for homes built in 2015 via the project's plotly helper,
# using the 'grade', 'bedrooms', 'bathrooms' and 'floors' columns.
from bhishan.util_viz_plotly import plotly_bubbleplot

df1 = df.loc[df['yr_built'] == 2015]
plotly_bubbleplot(
    df1,
    'grade',
    'bedrooms',
    'bathrooms',
    'floors',
    size_factor=5,
    ofile='../reports/html/bubbleplot2.html',
    show=False,
)
# Keep homes of grade 7 or higher built in 2000 or later, restricted to the
# columns used for the map. The copy makes df1 independent of df so that
# adding a column below does not touch the original frame.
df1 = df.query(""" grade >= 7 and yr_built >= 2000 """)
df1 = df1[['lat', 'long', 'grade', 'yr_built', 'price']].copy()

# Build a per-row description string from grade, build year and price.
df1['text'] = (
    "Grade:" + df1['grade'].map(str)
    + " Built Year:" + df1['yr_built'].map(str)
    + " Price:" + df1['price'].map(str)
)
df1.head()
lat | long | grade | yr_built | price | text | |
---|---|---|---|---|---|---|
5 | 48 | -1.2e+02 | 11 | 2001 | 1.2e+06 | Grade:11 Built Year:2001 Price:1225000.0 |
9 | 47 | -1.2e+02 | 7 | 2003 | 3.2e+05 | Grade:7 Built Year:2003 Price:323000.0 |
29 | 48 | -1.2e+02 | 8 | 2005 | 7.2e+05 | Grade:8 Built Year:2005 Price:719000.0 |
30 | 48 | -1.2e+02 | 8 | 2003 | 5.8e+05 | Grade:8 Built Year:2003 Price:580500.0 |
31 | 48 | -1.2e+02 | 7 | 2005 | 2.8e+05 | Grade:7 Built Year:2005 Price:280000.0 |
# Map of the selected homes via the project's plotly mapbox helper, using
# the 'lat' / 'long' columns and color_col='grade'; saved as an HTML report.
from bhishan.util_viz_plotly import plotly_mapbox

plotly_mapbox(
    df1,
    'lat',
    'long',
    color_col='grade',
    show=False,
    ofile='../reports/html/mymap.html',
)
<span style="background:#2f4f4f"> Grade 11 </span> <span style="background:#800000"> Grade 7 </span> <span style="background:#006400"> Grade 8 </span> <span style="background:#000080"> Grade 9 </span> <span style="background:#9acd32"> Grade 10 </span> <span style="background:#ff0000"> Grade 12 </span> <span style="background:#ff8c00"> Grade 13 </span>
Grade 11 Grade 7 Grade 8 Grade 9 Grade 10 Grade 12 Grade 13