Introduction¶
In [2]:
import sys
# Show which Python interpreter (version and executable path) runs this notebook.
print(sys.version)
print(sys.executable)
In [3]:
import pandas as pd
import numpy as np
Data¶
In [4]:
if 'google.colab' in sys.modules:
!wget -O books.csv "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/books.csv"
!wget -O ratings.csv "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/ratings.csv"
In [38]:
from pathlib import Path
# Resolve the data directory for the current environment.
# On Colab the CSVs are downloaded into the working directory by the wget cell;
# locally they are read from the repository's data folder under the home directory.
# Previously `path_data` was only defined outside Colab, so loading failed there.
if 'google.colab' in sys.modules:
    path_data = Path('.')
else:
    path_data = Path.home() / 'github/Recommender_System/data/goodbooks_10k'

# Keep only the first 1000 books and the first 5000 ratings.
books = pd.read_csv(path_data / 'books.csv').head(1000)
ratings = pd.read_csv(path_data / 'ratings.csv').head(5000)

print(books.shape)
print(ratings.shape)
display(books.head(2))
display(ratings.head(2))
In [7]:
# Inspect the books table: dimensions, column names, and the first two rows.
print(books.shape)
print(books.columns)
books.head(2)
Out[7]:
In [8]:
# Narrow view of the book metadata for a quick look.
# Note: `books2` is reassigned from scratch later, in the Method 03 section.
books_cols = ['book_id', 'authors', 'original_publication_year', 'title', 'average_rating']
books2 = books[books_cols]
books2.head(2)
Out[8]:
In [9]:
# Inspect the ratings table: dimensions, column names, and the first two rows.
print(ratings.shape)
print(ratings.columns)
ratings.head(2)
Out[9]:
In [10]:
# Inner join on book_id: a rating is kept only if its book is present in `books`,
# and a book is kept only if it has at least one rating in `ratings`.
df = pd.merge(books, ratings, on="book_id", how="inner")
print(df.shape)
df.head(2)
Out[10]:
Method 01: Collaborative Filtering: Item (book) based¶
Item-based recommendation is a type of recommendation system that suggests items to users based on the similarities between items themselves, rather than on the preferences or behaviors of other users. It is a commonly used technique in the field of recommender systems, which are used in various online platforms to suggest products, services, or content to users based on their preferences or behaviors.
References:
In [11]:
# Boolean user x title matrix: True where the user has a rating for the title.
user_book_df = (
    df.groupby(["user_id", "title"])["rating"]
    .mean()
    .unstack()
    .notnull()
)
user_book_df.head(2)
Out[11]:
In [12]:
# Transpose the first two merged rows so that every column is listed vertically.
df.head(2).T
Out[12]:
In [13]:
# Use the first title column of the user/title matrix as the query book.
book_title = user_book_df.columns[0]
book_title
Out[13]:
In [14]:
# Column of the query book: one boolean per user (rated / not rated).
sample_df = user_book_df[book_title]
print(sample_df.shape)
sample_df.head(2)
Out[14]:
In [15]:
# Correlate every title's column with the query book's column across users and
# show the five highest correlations.
# NOTE(review): the query title is not filtered out, so it is expected to appear
# in this list itself — confirm against the output.
user_book_df.corrwith(sample_df).sort_values(ascending=False).head(5)
Out[15]:
Method 02: Collaborative Filtering: Item (book) based using cosine similarity¶
In [16]:
from sklearn.metrics.pairwise import cosine_similarity
def item_based_cf(user_id, ratings_df, books_df, n_recommendations=5):
    """Recommend books to a user with item-item collaborative filtering.

    Every book gets a score equal to the sum, over the books the user has
    rated, of (cosine similarity between the two books) * (the user's rating).
    Books the user already rated are excluded.

    Parameters
    ----------
    user_id : scalar
        Value looked up in ``ratings_df['user_id']``.
    ratings_df : pandas.DataFrame
        Must contain the columns ``user_id``, ``book_id`` and ``rating``.
    books_df : pandas.DataFrame
        Book catalogue; must contain a ``book_id`` column.
    n_recommendations : int, default 5
        Maximum number of books to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``books_df`` for the recommended books, ordered from the
        highest to the lowest score. Empty (same columns as ``books_df``)
        when the user has no ratings.
    """
    # User x book rating matrix; unrated cells become 0.
    user_item_matrix = ratings_df.pivot_table(
        index='user_id', columns='book_id', values='rating'
    ).fillna(0)

    # A user without ratings has nothing to base a recommendation on.
    if user_id not in user_item_matrix.index:
        return books_df.iloc[0:0]

    # Book x book cosine similarity computed from the rating columns.
    item_similarity = cosine_similarity(user_item_matrix.T)
    item_similarity_df = pd.DataFrame(
        item_similarity,
        index=user_item_matrix.columns,
        columns=user_item_matrix.columns,
    )

    # Unrated books contribute 0, so one matrix-vector product equals the
    # per-book accumulation loop (and duplicated ratings are not double counted).
    user_vector = user_item_matrix.loc[user_id]
    scores = item_similarity_df.dot(user_vector)

    # Remove the books the user has already rated.
    rated_ids = ratings_df.loc[ratings_df['user_id'] == user_id, 'book_id'].unique()
    scores = scores.drop(index=rated_ids, errors='ignore')

    # Only books present in the catalogue can be returned; filtering before the
    # top-n cut keeps unknown ids from using up recommendation slots.
    scores = scores[scores.index.isin(books_df['book_id'])]

    top_book_ids = (
        scores.sort_values(ascending=False, kind='stable')
        .head(n_recommendations)
        .index
    )

    # isin() yields catalogue order, so re-order the rows by recommendation rank.
    matches = books_df[books_df['book_id'].isin(top_book_ids)]
    rank = pd.Series(range(len(top_book_ids)), index=top_book_ids)
    order = np.argsort(matches['book_id'].map(rank).to_numpy(), kind='stable')
    return matches.iloc[order]
# Usage: recommend for the first user in the ratings sample. The selected
# `user_id` is passed on (instead of a hard-coded 1) so the later cells that
# display this user's ratings describe the same user as `df_out`.
user_id = ratings['user_id'].iloc[0]
df_out = item_based_cf(user_id, ratings, books)
df_out
Out[16]:
Results¶
In [18]:
# User selected in the usage cell above.
print(user_id)
In [19]:
# All ratings given by the selected user.
ratings[ratings['user_id'] == user_id]
Out[19]:
In [20]:
# Cover image URLs of the recommended books.
df_out['image_url'].values
Out[20]:
In [21]:
from IPython.display import Image, HTML
def path_to_image_html(path):
    """Wrap an image URL or path in an HTML ``<img>`` tag.

    Used as a column formatter for ``DataFrame.to_html(escape=False)``.

    Parameters
    ----------
    path : str
        Image URL or file path; inserted into the ``src`` attribute as is.

    Returns
    -------
    str
        ``<img src="..."/>`` markup. The stray extra double quote after the
        attribute value, which produced malformed HTML, has been removed.
    """
    return '<img src="' + path + '"/>'
In [22]:
# Render the recommendations as an HTML table. escape=False lets the <img> tags
# produced by path_to_image_html be rendered as images instead of as text.
cols = ['image_url', 'title', 'authors', 'original_publication_year', 'average_rating']
HTML(df_out[cols].to_html(escape=False, formatters=dict(image_url=path_to_image_html), justify='center'))
Out[22]:
Method 03: Content-Based Filtering: Item (Book) Based using multiple columns¶
In [23]:
# Fresh copy of the full books table with missing values replaced by empty strings.
# This overwrites the narrower `books2` created earlier in the notebook.
books2 = books.fillna('')
In [24]:
def clean_data(x):
    """Return ``x`` with all space characters removed and converted to lower case."""
    without_spaces = x.replace(" ", "")
    return without_spaces.lower()
In [25]:
# Columns used to describe each book; restrict books2 to exactly these.
features = ['original_title','authors','average_rating']
books2=books2[features]
In [26]:
# Cast every feature (including the numeric average_rating) to str, then show the dtypes.
books2 = books2.astype(str)
books2.dtypes
Out[26]:
In [27]:
# Normalise each feature column with clean_data (spaces removed, lower-cased).
for feature in features:
    books2[feature] = (
        books2[feature]
        .astype(str)
        .apply(clean_data)
    )
books2.head(2)
Out[27]:
In [28]:
def create_soup(x):
    """Concatenate title, authors and average rating of one row, separated by single spaces."""
    title = x['original_title']
    authors = x['authors']
    rating = x['average_rating']
    return title + ' ' + authors + ' ' + rating
In [29]:
# Build the per-book text "soup" row by row from the cleaned feature columns.
books2['soup'] = books2.apply(create_soup, axis=1)
In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Token counts over the 'soup' strings (English stop words removed), followed by
# the pairwise cosine similarity between all rows of that count matrix.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(books2['soup'])
cosine_sim = cosine_similarity(count_matrix,count_matrix)
In [31]:
# Check the container type and the dimensions of the similarity matrix.
print(type(cosine_sim))
print(cosine_sim.shape)
In [32]:
# Map each cleaned title to its row position in books2 / cosine_sim.
# drop=True keeps the cell re-runnable: without it the first run inserts an
# 'index' column and a second run fails because that column already exists.
books2 = books2.reset_index(drop=True)
indices = pd.Series(books2.index, index=books2['original_title'])
indices.head(2)
Out[32]:
In [33]:
def get_recommendations_new(title, cosine_sim=cosine_sim, top_n=10):
    """Return the original titles of the books most similar to ``title``.

    Parameters
    ----------
    title : str
        Book title; spaces are removed and the text is lower-cased before it is
        looked up in the module-level ``indices`` Series.
    cosine_sim : array-like, default: the module-level ``cosine_sim``
        Square similarity matrix whose row/column positions match ``indices``.
    top_n : int, default 10
        Number of similar books to return.

    Returns
    -------
    list
        Values of ``books['original_title']`` for the most similar books,
        most similar first; the queried book itself is never included.

    Raises
    ------
    KeyError
        If the normalised title is not present in ``indices``.
    """
    key = title.replace(' ', '').lower()
    if key not in indices.index:
        raise KeyError(f"Title not found in the book index: {title!r}")

    idx = indices[key]
    # Several books can share one normalised title (for example an empty one);
    # the lookup then yields a Series, so fall back to the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pairwise similarity of every book with the queried one. The queried book
    # is excluded explicitly instead of assuming it sorts into first place,
    # which does not hold when another book ties with it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first (stable sort: ties keep their row order).
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the top matches, mapped back to the original titles.
    book_indices = [position for position, _ in sim_scores[:top_n]]
    return list(books['original_title'].iloc[book_indices])
In [34]:
# Cleaned titles (spaces removed, lower-cased) as stored in books2.
books2['original_title'].head(2)
Out[34]:
In [35]:
# Raw titles from the books table, for comparison with the cleaned ones.
books['original_title'].head(2)
Out[35]:
In [36]:
# Query the recommender with the original title of the first row of `books`.
# `book_title` replaces the value chosen in the Method 01 section.
book_title = books['original_title'].iloc[0]
lst = get_recommendations_new(book_title, cosine_sim)
print('Given Book: ', book_title)
print('Recommended Books: ')
print(lst)
In [37]:
# Render the recommended books as an HTML table with cover images.
# `cols` and `df_out` overwrite the values set in the Method 02 results cells.
# NOTE(review): isin() selects rows by title membership, so the table follows the
# row order of `books` rather than the ranking in `lst`, and any other row that
# shares one of those titles is included as well — confirm this is intended.
cols = ['image_url', 'title', 'authors', 'original_publication_year', 'average_rating']
df_out = books[books['original_title'].isin(lst)]
HTML(df_out[cols].to_html(escape=False, formatters=dict(image_url=path_to_image_html), justify='center'))
Out[37]:
In [ ]: