Introduction¶
In [1]:
import sys
# Record which interpreter produced this run: version string first, then the
# path of the Python executable.
print(sys.version, sys.executable, sep='\n')
In [2]:
import pandas as pd
import numpy as np
import scipy
# Load the `watermark` IPython extension, then report the imported packages
# (`-iv`) so the notebook's environment is recorded alongside its results.
%load_ext watermark
%watermark -iv
Data¶
In [3]:
# When running on Google Colab (detected by the `google.colab` module being
# loaded), fetch the two goodbooks-10k CSV files into the current working
# directory. `-O` fixes the local file name.
if 'google.colab' in sys.modules:
    !wget -O books.csv "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/books.csv"
    !wget -O ratings.csv "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/ratings.csv"
In [4]:
from pathlib import Path
# Locate the goodbooks-10k CSV files.
# On Colab the download cell writes books.csv / ratings.csv into the current
# working directory, so read them from there; otherwise use the local checkout.
# (Previously `path_data` was only defined off-Colab, which left the reads
# below without a valid path when running on Colab.)
if 'google.colab' in sys.modules:
    path_data = Path('.')
else:
    path_data = Path.home() / 'github/Recommender_System/data/goodbooks_10k'

# Work on a small prefix of each file: first 1000 books, first 5000 ratings.
books = pd.read_csv(path_data / 'books.csv').head(1000)
ratings = pd.read_csv(path_data / 'ratings.csv').head(5000)

# Sanity check: dimensions and a two-row preview of each table.
print(books.shape)
print(ratings.shape)
display(books.head(2))
display(ratings.head(2))
In [5]:
# Books table at a glance: (rows, columns), then the column index, then a
# two-row preview rendered as the cell output.
print(books.shape, books.columns, sep='\n')
books.head(2)
Out[5]:
In [6]:
# Slimmer copy of the books table: identifier, author, publication year,
# title and average rating only.
books_cols = [
    'book_id',
    'authors',
    'original_publication_year',
    'title',
    'average_rating',
]
books2 = books.loc[:, books_cols]
books2.head(2)
Out[6]:
In [7]:
# Ratings table at a glance: (rows, columns), then the column index, then a
# two-row preview rendered as the cell output.
print(ratings.shape, ratings.columns, sep='\n')
ratings.head(2)
Out[7]:
In [8]:
# Attach book metadata to every rating. The inner join on `book_id` keeps only
# ratings whose book is present in `books` (and vice versa).
df = books.merge(ratings, how="inner", on="book_id")
print(df.shape)
df.head(2)
Out[8]:
Model-based recommender engine: SVD
In [9]:
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
def svd_recommender(user_id, ratings_df, books_df, n_recommendations=5):
    """Recommend books a user has not rated, using a truncated SVD of the ratings.

    A user x book matrix is built from ``ratings_df``, centred per user,
    approximated with at most 50 singular components, and the reconstructed
    scores for ``user_id`` are used to rank the books that user has not rated.

    Parameters
    ----------
    user_id : scalar
        Value from ``ratings_df['user_id']`` identifying the target user.
    ratings_df : pandas.DataFrame
        Must contain the columns ``user_id``, ``book_id`` and ``rating``.
    books_df : pandas.DataFrame
        Book catalogue; must contain a ``book_id`` column.
    n_recommendations : int, default 5
        Maximum number of books to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``books_df`` (all columns, original index labels) for the
        recommended books, ordered from highest to lowest predicted score.
        Fewer than ``n_recommendations`` rows are returned only when fewer
        unrated books are present in both the ratings matrix and ``books_df``.

    Raises
    ------
    KeyError
        If ``user_id`` has no rows in ``ratings_df``.
    """
    book_id_col = 'book_id'

    # Users x books matrix. pivot_table averages duplicate (user, book) rows;
    # books the user has not rated become 0.
    user_item_matrix = ratings_df.pivot_table(
        index='user_id', columns='book_id', values='rating'
    ).fillna(0)

    if user_id not in user_item_matrix.index:
        raise KeyError(f"user_id {user_id!r} has no ratings in ratings_df")

    # Centre every row on that user's mean.
    # NOTE(review): the mean is taken over the zero-filled row, so unrated
    # books count as 0 and pull it down. Kept as in the original model --
    # confirm this is the intended normalisation.
    ratings_matrix = user_item_matrix.to_numpy(dtype=float)
    user_means = ratings_matrix.mean(axis=1, keepdims=True)
    centered = ratings_matrix - user_means

    # svds needs 1 <= k < min(shape). With a single user or a single book there
    # is nothing to factorise, so fall back to the per-user mean as the score.
    n_factors = min(50, min(centered.shape) - 1)
    if n_factors >= 1:
        U, sigma, Vt = svds(centered, k=n_factors)
        # (U * sigma) scales the columns of U, i.e. U @ diag(sigma).
        predicted_ratings = (U * sigma) @ Vt + user_means
    else:
        predicted_ratings = user_means + np.zeros_like(centered)

    # Only the target user's row is needed.
    user_row = user_item_matrix.index.get_loc(user_id)
    user_preds = pd.Series(predicted_ratings[user_row], index=user_item_matrix.columns)

    # Candidates: not yet rated by the user AND present in the catalogue.
    # Filtering by catalogue *before* taking the top N keeps a truncated
    # books_df from silently shrinking the result.
    rated_books = ratings_df.loc[ratings_df['user_id'] == user_id, 'book_id']
    candidates = user_preds.drop(rated_books, errors='ignore')
    candidates = candidates[candidates.index.isin(books_df[book_id_col])]

    top_book_ids = candidates.sort_values(ascending=False).head(n_recommendations).index

    df_out = books_df[books_df[book_id_col].isin(top_book_ids)]
    if df_out.empty:
        return df_out

    # Re-order the catalogue rows by recommendation rank (best first); the
    # boolean filter above would otherwise leave them in books_df order.
    rank_of = {book_id: rank for rank, book_id in enumerate(top_book_ids)}
    order = np.argsort(df_out[book_id_col].map(rank_of).to_numpy(), kind='stable')
    return df_out.iloc[order]
# Demo: recommend for whichever user appears in the first ratings row and show
# the resulting books as the cell output.
user_id = ratings['user_id'].iat[0]
df_out = svd_recommender(user_id=user_id, ratings_df=ratings, books_df=books)
df_out
Out[9]:
In [ ]: