Online shops often sell a huge variety of items, and the catalog can become messy very quickly!
Data science can be extremely useful for automatically organizing products into categories so that customers can find them easily.
The goal of this challenge is to look at user purchase history and create categories of items that are likely to be bought together and that should, therefore, belong to the same section.
Company XYZ is an online grocery store. In the current version of the website, they have manually grouped the items into a few categories based on their experience.
However, they now have a lot of data about user purchase history, and they would like to put that data to use!
This is what they asked you to do:
a. The customer who bought the most items overall in her lifetime
b. For each item, the customer who bought that product the most (a toy sketch of both computations follows)
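Both questions reduce to simple aggregations once the purchase log is in long format. A minimal sketch on made-up data (the toy values and the column name 'item' are illustrative only):

import pandas as pd

toy = pd.DataFrame({'user_id': [1, 1, 2, 2, 2],
                    'item': ['milk', 'sugar', 'milk', 'milk', 'tea']})
# (a) the customer who bought the most items overall
toy.groupby('user_id').size().idxmax()  # -> 2
# (b) for each item, the customer who bought it the most;
#     each value is an (item, user_id) pair
toy.groupby(['item', 'user_id']).size().groupby('item').idxmax()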
import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm  # 'from tqdm import tqdm_notebook' is deprecated
pd.options.display.max_columns = 100
SEED = 100
np.random.seed(SEED) # we need this in each cell that calls random
plt.style.use('ggplot')
%matplotlib inline
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
!ls data/
df_item = pd.read_csv('data/item_to_id.csv')
print(df_item.shape)
pd.concat([df_item.head(2), df_item.tail(2)])  # DataFrame.append was removed in pandas 2.0
df_item['Item_id'].nunique(), df_item.shape[0]   # 48 unique item ids
df_item['Item_name'].nunique(), df_item.shape[0] # 48 unique item names
df_item.set_index('Item_id',inplace=True)
df_item.head(2)
# df_item['Item_name'].to_dict()
df = pd.read_csv('data/purchase_history.csv')
print(df.shape)
pd.concat([df.head(2), df.tail(2)]) # ~39k purchase records
df['user_id'].nunique(), df.shape[0]
# user_id is not unique, same user has shopped more than once.
df['id'] = df['id'].astype(str).str.split(',')  # each row stores a comma-separated list of item ids
df.head(2)
df = df.explode('id')  # one row per (user, item) purchase
df = pd.crosstab(df['user_id'], df['id'], margins=True, margins_name='total')  # user x item count matrix
df.head()
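After split + explode, the log has one row per (user, item) purchase, and pd.crosstab turns it into a user x item count matrix; margins=True appends a 'total' row and column. A minimal sketch of the same idea on toy data (values are made up):

toy = pd.DataFrame({'user_id': [1, 1, 2], 'id': ['3', '5', '3']})
pd.crosstab(toy['user_id'], toy['id'], margins=True, margins_name='total')
# id       3  5  total
# user_id
# 1        1  1  2
# 2        1  0  1
# total    2  1  3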
df.columns
# df_item['Item_name'].to_dict()
mapping = df_item['Item_name'].to_dict()
mapping = {str(k): v for k, v in mapping.items()}  # crosstab made the column labels strings, so cast the keys too
# mapping
df.columns = df.columns.map(mapping).fillna('item_total')  # the 'total' margin column is not in the mapping; rename it
df.head(2)
df.index = 'user_' + df.index.astype(str)
pd.concat([df.head(2), df.tail(2)])
df.iloc[:-1].nlargest(5, 'item_total').iloc[:, -1:]  # drop the 'total' margin row, then rank users
"""
This is the list of top5 users who bought the most items.
""";
pd.concat([df.head(2), df.tail(2)])
df.iloc[:-1].T.idxmax(axis=1).to_frame('user').head()  # for each item, the user who bought it most (question b)
df.iloc[:-1].apply(lambda s: pd.Series(
    [s.idxmax(), s.max()],
    index=['max_user', 'max_count']
)).T.head()
# validate the result
df.head(2)
df.loc['user_31625'].to_frame().T # this user has bought 4 sugars in total
df[['sugar']].T.filter(regex='user_31625')
Cluster items based on user co-purchase history; that is, create clusters of products that have the highest probability of being bought together. The goal is to replace the old, manually created categories with these new ones. Each item can belong to just one cluster. The approach below normalizes each item's purchase-count column, builds an item-item cosine-similarity matrix, and then clusters the rows of that matrix with K-means.
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
pd.concat([df.head(2), df.tail(2)]).iloc[:, -5:]
df.shape
# drop the 'total' margin row and the 'item_total' margin column
df = df.iloc[:-1, :-1]
print(df.shape)
pd.concat([df.head(2), df.tail(2)]).iloc[:, -5:]
from sklearn.preprocessing import normalize
item_norm = normalize(df, axis=0)  # L2-normalize each item column (NOT each user row)
item_sim = item_norm.T.dot(item_norm)  # Gram matrix of unit columns = item-item cosine similarity
df_item_sim = pd.DataFrame(item_sim,
                           index=df.columns,
                           columns=df.columns)
print(df_item_sim.shape)
df_item_sim.head(2)
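Because every item column was scaled to unit L2 norm, the dot product of two columns equals their cosine similarity, cos(i, j) = x_i . x_j / (||x_i|| ||x_j||); with non-negative counts this lies in [0, 1]. A quick sanity check against scikit-learn's own implementation (cosine_similarity in sklearn.metrics.pairwise works on rows, hence the transpose):

from sklearn.metrics.pairwise import cosine_similarity
np.allclose(item_sim, cosine_similarity(df.T))  # expected: True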
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
kmeans = KMeans()
kmeans
inertias = []
silhouettes = []
ks = range(2, 30)
for k in ks:
    # n_jobs was removed from KMeans in scikit-learn 1.0, so it is not passed here
    kmeans = KMeans(n_clusters=k, random_state=SEED, init='k-means++')
    kmeans.fit(df_item_sim)
    inertias.append(kmeans.inertia_)
    # labels_ equals predict() on the training data
    silhouettes.append(silhouette_score(df_item_sim, kmeans.labels_))
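For each point, the silhouette compares the mean distance a to points in its own cluster with the mean distance b to the nearest other cluster: s = (b - a) / max(a, b), so s lies in [-1, 1] and values near 1 indicate well-separated clusters; silhouette_score averages s over all points. Inertia, by contrast, always decreases as k grows, so the inertia plot is read for an elbow while the silhouette plot is read for a peak.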
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(20,8))
ax1.plot(ks, inertias,marker='o')
ax1.set_xticks(list(ks));
ax1.set_title('Plot of inertia')
ax1.set_xlabel('Number of clusters')
ax1.set_ylabel('Inertia');
# silhouette plot
ax2.plot(ks, silhouettes, marker='o')
ax2.set_xticks(list(ks));
ax2.set_title('Plot of silhouettes')
ax2.set_xlabel('Number of clusters')
ax2.set_ylabel('Silhouette');
ax2.axvline(15, color='b', ls='--');  # mark k = 15, chosen below
# k = 15 chosen from the silhouette plot above (dashed line)
n_clusters = 15
kmeans = KMeans(n_clusters=n_clusters, random_state=SEED,
                init='k-means++')
kmeans.fit(df_item_sim)
kmeans.labels_
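A quick look at how the 48 items spread across the clusters (a small check, not in the original; np.bincount counts occurrences of each label):

np.bincount(kmeans.labels_)  # number of items in each of the 15 clusters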
from sklearn.decomposition import PCA
pca = PCA()
pca
pca = PCA(n_components=2,random_state=SEED)
arr_pca = pca.fit_transform(df_item_sim)
arr_pca.shape
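Before trusting the 2-D picture, it is worth checking how much variance the projection keeps (explained_variance_ratio_ is a standard PCA attribute):

pca.explained_variance_ratio_.sum()  # fraction of variance captured by pc_0 and pc_1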
df_pca = pd.DataFrame(arr_pca, columns=['pc_0', 'pc_1'],
                      index=df_item_sim.index.to_numpy())
df_pca.head(2)
kmeans.labels_
len(kmeans.labels_)
# map each cluster label to a color (magma palette, one color per cluster)
colors_dict = dict(enumerate(sns.color_palette('magma', n_clusters)))
mycolors = [colors_dict[k] for k in kmeans.labels_]
fig, ax = plt.subplots(figsize=(20,20))
df_pca.plot.scatter(x='pc_0',y='pc_1',color=mycolors,ax=ax)
# annotate each point with its item name, colored by its cluster
for i, (name, (x, y)) in enumerate(df_pca.iloc[:, :2].iterrows()):
    ax.annotate(name, [x, y],
                xytext=(10, -5),
                textcoords='offset points',
                size=24,
                color=mycolors[i])
df_item_sim.columns
kmeans.labels_
df_labels = pd.DataFrame({
    'item': df_item_sim.columns.to_numpy(),
    'label': kmeans.labels_
})
df_labels.head(2)
pd.set_option('display.max_colwidth', None)  # -1 is deprecated; None means no truncation
df_labels.groupby('label')['item'].apply(list).to_frame()  # final clusters: items grouped by K-means label
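For downstream use, such as replacing the manual categories on the site, a flat item-to-cluster mapping may be handier than the grouped table (a small convenience sketch, not in the original):

item_to_cluster = dict(zip(df_labels['item'], df_labels['label']))
item_to_cluster['sugar']  # cluster id assigned to 'sugar'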
time_taken = time.time() - time_start_notebook
h, m = divmod(time_taken, 60*60)
print('Time taken to run whole notebook: {:.0f} hr '
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m, 60)))
import subprocess
# a list argument does not expand '*.ipynb'; run via the shell so the glob works
subprocess.call('jupyter nbconvert --to html *.ipynb', shell=True)