This is a transactional data set containing all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retailer. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.
Feature | Description |
---|---|
InvoiceNo | Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation. |
StockCode | Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product. |
Description | Product (item) name. Nominal. |
Quantity | The quantities of each product (item) per transaction. Numeric. |
InvoiceDate | Invoice date and time. Numeric, the day and time when each transaction was generated. |
UnitPrice | Unit price. Numeric, product price per unit in sterling. |
CustomerID | Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer. |
Country | Country name. Nominal, the name of the country where each customer resides. |
import numpy as np
import pandas as pd
import os,sys,time
import re
time_start_notebook = time.time()
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# settings
SEED = 100
pd.set_option('display.max_columns',100)
%matplotlib inline
%load_ext watermark
%watermark -iv
numpy 1.19.5 autopep8 1.5.2 json 2.0.9 pandas 1.3.0 seaborn 0.11.0 re 2.2.1
!ls data/raw
Online Retail.xlsx online_retail.parquet.gzip head.png tail.png mid.png
# %%time
# date_fmt = "%m/%d/%y %H:%M"
# df = pd.read_excel("data/raw/Online Retail.xlsx",parse_dates=['InvoiceDate'])
# print(df.shape)
# df.head().append(df.tail())
# Wall time: 57.1 s
# pd.read_excel?
# df_orig = df.copy()
# df.dtypes
# df.isna().sum()
"""
Note: we can write the data as parquet, but some of the invoice numbers
start with the letter 'C' (cancellations), which makes the column mixed-type
and causes a conversion error:
df.to_parquet('data/raw/online_retail.parquet.gzip',compression='gzip')
ArrowInvalid: ("Could not convert 'C536379' with type str: tried to convert to int",
'Conversion failed for column InvoiceNo with type object')
""";
# df.columns
# cols_str = ['InvoiceNo', 'StockCode', 'Description', 'CustomerID', 'Country']
# df[cols_str] = df[cols_str].astype(str)
# df.to_parquet('data/raw/online_retail.parquet.gzip',compression='gzip')
%%time
ifile = 'data/raw/online_retail.parquet.gzip'
df = pd.read_parquet(ifile)
CPU times: user 266 ms, sys: 61.1 ms, total: 327 ms Wall time: 241 ms
df.head().append(df.tail())
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
541904 | 581587 | 22613 | PACK OF 20 SPACEBOY NAPKINS | 12 | 2011-12-09 12:50:00 | 0.85 | 12680.0 | France |
541905 | 581587 | 22899 | CHILDREN'S APRON DOLLY GIRL | 6 | 2011-12-09 12:50:00 | 2.10 | 12680.0 | France |
541906 | 581587 | 23254 | CHILDRENS CUTLERY DOLLY GIRL | 4 | 2011-12-09 12:50:00 | 4.15 | 12680.0 | France |
541907 | 581587 | 23255 | CHILDRENS CUTLERY CIRCUS PARADE | 4 | 2011-12-09 12:50:00 | 4.15 | 12680.0 | France |
541908 | 581587 | 22138 | BAKING SET 9 PIECE RETROSPOT | 3 | 2011-12-09 12:50:00 | 4.95 | 12680.0 | France |
df.dtypes
InvoiceNo object StockCode object Description object Quantity int64 InvoiceDate datetime64[ns] UnitPrice float64 CustomerID object Country object dtype: object
df.Country.value_counts()
United Kingdom 495478 Germany 9495 France 8557 EIRE 8196 Spain 2533 Netherlands 2371 Belgium 2069 Switzerland 2002 Portugal 1519 Australia 1259 Norway 1086 Italy 803 Channel Islands 758 Finland 695 Cyprus 622 Sweden 462 Unspecified 446 Austria 401 Denmark 389 Japan 358 Poland 341 Israel 297 USA 291 Hong Kong 288 Singapore 229 Iceland 182 Canada 151 Greece 146 Malta 127 United Arab Emirates 68 European Community 61 RSA 58 Lebanon 45 Lithuania 35 Brazil 32 Czech Republic 30 Bahrain 19 Saudi Arabia 10 Name: Country, dtype: int64
# choose only UK data
df = df.query("Country == 'United Kingdom'")
# remove unwanted columns
cols_drop = ['Country','Description','StockCode']
df = df.drop(cols_drop,axis=1)
# convert CamelCase to snake_case
df.columns
Index(['InvoiceNo', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID'], dtype='object')
# df.columns = ['invoice_no', 'quantity', 'invoice_date', 'unit_price', 'customer_id']
df = df.rename(columns=lambda x: re.sub(r'(?!^)([A-Z]+)',r'_\1',x))
df.columns = df.columns.str.lower()
df.columns
Index(['invoice_no', 'quantity', 'invoice_date', 'unit_price', 'customer_id'], dtype='object')
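The regex rename above can be illustrated in isolation. This is a minimal sketch (the helper name `to_snake` is my own, not from the notebook) showing how the pattern inserts an underscore before each interior run of capitals before lowercasing:

```python
import re

def to_snake(name):
    # Insert "_" before any capital run that is not at the start of the
    # string, then lowercase the whole name.
    return re.sub(r'(?!^)([A-Z]+)', r'_\1', name).lower()

print(to_snake('InvoiceNo'))   # invoice_no
print(to_snake('CustomerID'))  # customer_id
print(to_snake('UnitPrice'))   # unit_price
```

Note that a capital *run* is treated as one unit, so `CustomerID` becomes `customer_id` rather than `customer_i_d`.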
# re-arrange columns
cols = ['customer_id', 'invoice_no', 'invoice_date', 'quantity', 'unit_price']
df = df[cols]
df.head()
customer_id | invoice_no | invoice_date | quantity | unit_price | |
---|---|---|---|---|---|
0 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 2.55 |
1 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 3.39 |
2 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 8 | 2.75 |
3 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 3.39 |
4 | 17850.0 | 536365 | 2010-12-01 08:26:00 | 6 | 3.39 |
# create new feature
df['total_sales'] = df['quantity'] * df['unit_price']
df.describe()
quantity | unit_price | total_sales | |
---|---|---|---|
count | 495478.000000 | 495478.000000 | 495478.000000 |
mean | 8.605486 | 4.532422 | 16.525065 |
std | 227.588756 | 99.315438 | 394.839116 |
min | -80995.000000 | -11062.060000 | -168469.600000 |
25% | 1.000000 | 1.250000 | 3.290000 |
50% | 3.000000 | 2.100000 | 8.290000 |
75% | 10.000000 | 4.130000 | 16.630000 |
max | 80995.000000 | 38970.000000 | 168469.600000 |
"""
Observation:
1. We can see some negative quantity values; these are likely due to discounts
   or item returns (cancelled invoices start with 'C').
2. Since we are interested in customer lifetime value, we keep only positive values.
""";
n_before = df.shape[0]
df = df[df.quantity>0]
n_after = df.shape[0]
n_before-n_after
9192
n_before = df.shape[0]
df = df[df.total_sales>0]
n_after = df.shape[0]
n_before-n_after
1163
df.describe()
df.isna().sum()
df.head()
df.agg({
    'invoice_date':[min,max,pd.Series.nunique],
    'customer_id': pd.Series.nunique,
    'quantity': sum,
    'total_sales': sum
}).round(0).fillna('').replace(pd.NaT,'')
"""
Observation:
1. We have data from December 1, 2010 to December 9, 2011.
2. We have 3,921 unique customers.
""";
import re
import numpy as np
import pandas as pd
ifile = 'data/raw/online_retail.parquet.gzip'
df = pd.read_parquet(ifile)
# choose only UK data
df = df.query("Country == 'United Kingdom'")
# remove unwanted columns
cols_drop = ['Country','Description','StockCode']
df = df.drop(cols_drop,axis=1)
# make snake_case columns
df = df.rename(columns=lambda x: re.sub(r'(?!^)([A-Z]+)',r'_\1',x))
df.columns = df.columns.str.lower()
# re-arrange columns
cols = ['customer_id', 'invoice_no', 'invoice_date', 'quantity', 'unit_price']
df = df[cols]
# customer id must not be null
df = df.dropna(how='any',subset=['customer_id'])
df = df.query("customer_id != 'nan'")
# create new feature
df['total_sales'] = df['quantity'] * df['unit_price']
# filter only positive data
df = df[df.quantity>0]
df = df[df.total_sales>0]
# write parquet format (3MB)
df.to_parquet('data/processed/online_retail.parquet.gzip',compression='gzip')
# write hdf5 (12MB)
# df.to_hdf('data/processed/online_retail.h5', key='online_retail', mode='w',complevel=9)
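The `customer_id != 'nan'` filter in the pipeline above exists because the raw columns were cast with `astype(str)` before writing the parquet file, which turns missing IDs into the literal string `'nan'`. A minimal demonstration:

```python
import numpy as np
import pandas as pd

# Casting a float column with missing values to str converts NaN into the
# literal string 'nan', so it no longer registers as missing via isna().
ids = pd.Series([17850.0, np.nan]).astype(str)
print(ids.tolist())  # ['17850.0', 'nan']
```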
df.dtypes
df.query("customer_id == 'nan'").shape
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))