Introduction
This notebook builds a simple content-based book recommender on the goodbooks-10k dataset: TF-IDF features over book titles and authors, ranked by cosine similarity.
In [1]:
import sys
print(sys.version)
print(sys.executable)
In [2]:
import pandas as pd
import numpy as np
import scipy
# watermark -iv reports the versions of the imported packages
%load_ext watermark
%watermark -iv
Data
The goodbooks-10k dataset provides book metadata (books.csv) and user ratings (ratings.csv). On Colab the files are downloaded from GitHub; otherwise they are read from a local checkout. Both tables are truncated to a small sample to keep the example quick.
In [3]:
if 'google.colab' in sys.modules:
    !wget -O books.csv "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/books.csv"
    !wget -O ratings.csv "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/ratings.csv"
In [4]:
from pathlib import Path

# On Colab the CSVs were downloaded to the working directory above;
# otherwise read them from a local checkout of the dataset
if 'google.colab' in sys.modules:
    path_data = Path('.')
else:
    path_data = Path.home() / 'github/Recommender_System/data/goodbooks_10k'

books = pd.read_csv(path_data / 'books.csv').head(1000)
ratings = pd.read_csv(path_data / 'ratings.csv').head(5000)

print(books.shape)
print(ratings.shape)
display(books.head(2))
display(ratings.head(2))
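As a side note, pandas can also read the raw GitHub URLs directly, which would make the cell above path-independent. A minimal sketch, using the same URLs as the wget cell:
In [ ]:
# Alternative loading path (sketch): read the CSVs straight from GitHub,
# equivalent to the wget + local-read cells above.
base = "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master"
books = pd.read_csv(f"{base}/books.csv").head(1000)
ratings = pd.read_csv(f"{base}/ratings.csv").head(5000)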
In [5]:
print(books.shape)
print(books.columns)
books.head(2)
Out[5]:
In [6]:
books_cols = ['book_id', 'authors', 'original_publication_year', 'title', 'average_rating']
books2 = books[books_cols]
books2.head(2)
Out[6]:
In [7]:
print(ratings.shape)
print(ratings.columns)
ratings.head(2)
Out[7]:
In [8]:
df = pd.merge(books, ratings, on="book_id", how="inner")
print(df.shape)
df.head(2)
Out[8]:
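Because both tables were truncated (first 1,000 books, first 5,000 ratings), the inner merge keeps only the ratings whose book_id appears in the book sample. A quick sanity check on the overlap:
In [ ]:
# How much of each table survived the inner join?
print(f"{len(df)} of {len(ratings)} sampled ratings matched a sampled book")
print(f"{df['book_id'].nunique()} distinct books, {df['user_id'].nunique()} distinct users")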
Content-Based Filtering
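The recommender below vectorizes each book's title and author text with TF-IDF, measures book-to-book similarity with the cosine of those vectors, and scores candidates for a user by summing their similarity to every book the user rated 4 stars or higher. Already-liked books are dropped from the ranking; if the user has no qualifying ratings, the function falls back to the most-rated books.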
In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
def content_based_recommender(user_id, ratings_df, books_df, n_recommendations=5):
    # Work on a positionally indexed copy so rows of cosine_sim line up with
    # rows of books_df, and so the caller's DataFrame is not mutated
    books_df = books_df.reset_index(drop=True)

    # goodbooks-10k metadata uses 'book_id'; fall back to 'id' just in case
    book_id_col = 'book_id' if 'book_id' in books_df.columns else 'id'

    # Build a TF-IDF matrix over title + author text
    tfidf = TfidfVectorizer(stop_words='english')
    books_df['description'] = books_df['title'].fillna('') + " " + books_df['authors'].fillna('')
    tfidf_matrix = tfidf.fit_transform(books_df['description'])

    # TF-IDF rows are L2-normalized, so the linear kernel equals cosine similarity
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    # The user's liked books (rated 4+ stars)
    liked_books = ratings_df[(ratings_df['user_id'] == user_id) &
                             (ratings_df['rating'] >= 4)]['book_id'].unique()

    # Keep only liked books that also appear in the metadata
    valid_books = set(books_df[book_id_col]).intersection(liked_books)
    if not valid_books:
        print(f"No valid books found for user {user_id}. Returning popular books instead.")
        return books_df.sort_values(by='ratings_count', ascending=False).head(n_recommendations)

    # Sum similarity scores across all of the user's liked books
    sim_scores = pd.Series(0.0, index=books_df[book_id_col])
    for book_id in valid_books:
        # valid_books guarantees a match; the index is positional after reset_index
        idx = books_df.index[books_df[book_id_col] == book_id][0]
        sim_scores += pd.Series(cosine_sim[idx], index=books_df[book_id_col])

    # Drop books the user has already liked
    sim_scores = sim_scores.drop(liked_books, errors='ignore')

    # Return the top-scoring remainder
    top_book_ids = sim_scores.sort_values(ascending=False).head(n_recommendations).index
    return books_df[books_df[book_id_col].isin(top_book_ids)][[book_id_col, 'title', 'authors']]
user_id = ratings['user_id'].iloc[0] # Or any specific user ID
recommendations = content_based_recommender(user_id, ratings, books)
recommendations
Out[9]:
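A note on the similarity step: TfidfVectorizer L2-normalizes each row by default (norm='l2'), so the plain dot product computed by linear_kernel is exactly cosine similarity, just cheaper than calling cosine_similarity on top. A minimal sketch to check this (the sample strings below are made up):
In [ ]:
from sklearn.metrics.pairwise import cosine_similarity

# Because TF-IDF rows are unit-length by default, X @ X.T (linear_kernel)
# matches cosine similarity exactly.
docs = ["the hunger games suzanne collins",
        "catching fire suzanne collins",
        "pride and prejudice jane austen"]
X = TfidfVectorizer(stop_words='english').fit_transform(docs)
print(np.allclose(linear_kernel(X, X), cosine_similarity(X, X)))  # True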