This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import os
import time
# Fixed seed and a dedicated random generator built from it.
SEED = 0
RNG = np.random.RandomState(SEED)

# Pandas display options for the notebook, applied in one pass.
# - max_columns / max_rows: truncate wide or long frames (max_rows=None
#   would show every row).
# - float_format: comma thousands separator with 2 significant digits, so
#   large values are rendered in scientific notation (e.g. 2.2e+05).
# - max_colwidth: cap the rendered width of each cell.
_PANDAS_DISPLAY_OPTIONS = {
    'display.max_columns': 50,
    'display.float_format': '{:,.2g}'.format,
    'display.max_rows': 20,
    'display.max_colwidth': 50,
}
for _option_name, _option_value in _PANDAS_DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)
import IPython
from IPython.display import display
# Print (name, version) pairs for the core libraries imported above.
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
[('numpy', '1.16.4'), ('pandas', '0.25.0'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]
%load_ext autoreload
%autoreload 2
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
import bokeh
from bokeh.io import output_file, output_notebook
from bokeh.plotting import figure, show, reset_output
from bokeh.models import ColumnDataSource
from bokeh.layouts import row, column, gridplot
from bokeh.models.widgets import Tabs, Panel
from bokeh.palettes import Spectral6
from bokeh.models import ColumnDataSource,FactorRange
# Output the visualization directly in the notebook
output_notebook()
# Show the (name, version) pair for bokeh as the cell result.
[(x.__name__,x.__version__) for x in [bokeh]]
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
def show_method_attributes(method, ncols=7):
    """Tabulate the public attribute names of an object.

    Despite the parameter name, any object accepted by ``dir()`` works
    (class, instance, module, function, ...).

    Example
    =======
    show_method_attributes(list)

    Parameters
    ----------
    method : object
        Object whose attributes are listed via ``dir(method)``.
    ncols : int, default 7
        Number of columns in the resulting table. Must be positive;
        ``np.array_split`` raises ``ValueError`` otherwise.

    Returns
    -------
    pandas.DataFrame
        Names laid out column by column (in ``dir()`` order) across
        ``ncols`` columns; shorter columns are padded with ''.
    """
    # Names of commonly imported modules that are dropped from the listing
    # (presumably so they do not clutter the output when a module is
    # inspected). Built once instead of re-splitting per candidate name.
    excluded = set('os np pd sys time psycopg2'.split())

    # Keep public names only: skip anything starting with an underscore.
    names = [
        name for name in dir(method)
        if not name.startswith('_') and name not in excluded
    ]

    # array_split tolerates uneven division, so the chunks may differ in
    # length; each chunk becomes one column after the transpose and the
    # missing cells are blanked out.
    chunks = np.array_split(names, ncols)
    return pd.DataFrame(chunks).T.fillna('')
# Load the processed CSV (path is relative to the notebook's working directory).
df = pd.read_csv('../data/processed/data_cleaned_encoded.csv')
print(df.shape)
# Transpose the 5-row preview so the columns are listed as rows.
df.head().T
(21613, 92)
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
id | 7129300520 | 6414100192 | 5631500400 | 2487200875 | 1954400510 |
date | 2014-10-13 | 2014-12-09 | 2015-02-25 | 2014-12-09 | 2015-02-18 |
price | 2.2e+05 | 5.4e+05 | 1.8e+05 | 6e+05 | 5.1e+05 |
bedrooms | 3 | 3 | 2 | 4 | 3 |
bathrooms | 1 | 2.2 | 1 | 3 | 2 |
sqft_living | 1180 | 2570 | 770 | 1960 | 1680 |
sqft_lot | 5650 | 7242 | 10000 | 5000 | 8080 |
floors | 1 | 2 | 1 | 1 | 1 |
waterfront | 0 | 0 | 0 | 0 | 0 |
view | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... |
age_after_renovation_cat_7 | 0 | 0 | 1 | 0 | 0 |
age_after_renovation_cat_8 | 0 | 0 | 0 | 0 | 0 |
age_after_renovation_cat_9 | 0 | 0 | 0 | 0 | 0 |
log1p_price | 12 | 13 | 12 | 13 | 13 |
log1p_sqft_living | 7.1 | 7.9 | 6.6 | 7.6 | 7.4 |
log1p_sqft_lot | 8.6 | 8.9 | 9.2 | 8.5 | 9 |
log1p_sqft_above | 7.1 | 7.7 | 6.6 | 7 | 7.4 |
log1p_sqft_basement | 0 | 6 | 0 | 6.8 | 0 |
log1p_sqft_living15 | 7.2 | 7.4 | 7.9 | 7.2 | 7.5 |
log1p_sqft_lot15 | 8.6 | 8.9 | 9 | 8.5 | 8.9 |
92 rows × 5 columns
# Count the number of rows for each distinct value of `bedrooms`.
df['bedrooms'].value_counts()
3 9824 4 6882 2 2760 5 1601 6 272 1 199 7 38 8 13 0 13 9 6 10 3 11 1 33 1 Name: bedrooms, dtype: int64
# The plotting helpers below come from the project-local `bhishan.util_bokeh`
# module; their exact behaviour is not visible here.

# Count plot of the `bedrooms` column.
from bhishan.util_bokeh import countplot_bokeh
ofile = '../reports/bokeh_outputs/bedrooms_countplot.html'
# NOTE(review): `ofile` is assigned above but `ofile=None` is passed, so the
# path is not used by this call. Presumably this skips writing the HTML
# file; confirm whether that is intended.
countplot_bokeh(df, 'bedrooms',height=400,ofile=None)

# Histogram of `sqft_living` with 20 bins.
from bhishan.util_bokeh import histogram_bokeh
histogram_bokeh(df,'sqft_living',n_bins=20)

# Scatter plot of `sqft_living` against `price`; the output path is passed
# through `ofile` (presumably the HTML file to write; verify in the helper).
from bhishan.util_bokeh import scatterplot_bokeh
ofile = '../reports/bokeh_outputs/sqftLiving_vs_price.html'
scatterplot_bokeh(df,'sqft_living','price',ofile=ofile)

# Stacked count plot using the `bedrooms`, `yr_sales` and `price` columns.
# NOTE(review): the role of each positional column argument is defined by
# the helper; confirm against its signature.
from bhishan.util_bokeh import stacked_countplot_bokeh
stacked_countplot_bokeh(df,'bedrooms','yr_sales','price')
# Preview the coordinate columns that are passed to the map plot below.
df[['lat','long']].head()
lat | long | |
---|---|---|
0 | 48 | -1.2e+02 |
1 | 48 | -1.2e+02 |
2 | 48 | -1.2e+02 |
3 | 48 | -1.2e+02 |
4 | 48 | -1.2e+02 |
# Map plot built from the `lat` and `long` columns, using the project-local
# helper. `ofile` is passed positionally as the fourth argument (presumably
# the HTML output path; verify against the helper's signature).
from bhishan.util_bokeh import map_plot_bokeh
ofile = '../reports/bokeh_outputs/map.html'
map_plot_bokeh(df, 'lat', 'long',ofile)