Recommender System¶
01: Data¶
import sys
import numpy as np
import pandas as pd
print(sys.version)
print(sys.executable)
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import os
if 'google.colab' in sys.modules:
    !wget -O books.csv "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/books.csv"
    !wget -O ratings.csv "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/ratings.csv"
from pathlib import Path
if 'google.colab' in sys.modules:
    path_data = Path('.')  # the wget calls above download into the working directory
else:
    path_data = Path.home() / 'github/Recommender_System/data/goodbooks_10k'
# Subsample so the demo trains quickly
books = pd.read_csv(path_data / 'books.csv').head(1000)
ratings = pd.read_csv(path_data / 'ratings.csv').head(5000)
print(books.shape)
print(ratings.shape)
display(books.head(2))
display(ratings.head(2))
# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Merge ratings with book metadata so each row carries title/authors alongside the rating
df = books.merge(ratings, how='inner', on='book_id')
df.head(2)
class BookRecommender(nn.Module):
def __init__(self, num_users, num_books, emb_dim=50):
super().__init__()
self.user_emb = nn.Embedding(num_users, emb_dim)
self.book_emb = nn.Embedding(num_books, emb_dim)
self.fc = nn.Linear(emb_dim, 1)
    def forward(self, user, book):
        user_emb = self.user_emb(user)
        book_emb = self.book_emb(book)
        # Sigmoid bounds the raw score to (0, 1); rescale to the 1-5 rating
        # range so the MSE loss against raw ratings can actually be minimized
        pred = 1 + 4 * torch.sigmoid(self.fc(user_emb * book_emb))
        return pred.squeeze()
class BookRatingDataset(Dataset):
    """Wraps the encoded (user, book, rating) columns as indexable tensors."""
def __init__(self, df):
self.users = torch.LongTensor(df['user_id_encoded'].values)
self.books = torch.LongTensor(df['book_id_encoded'].values)
self.ratings = torch.FloatTensor(df['rating'].values)
def __len__(self):
return len(self.users)
def __getitem__(self, idx):
return self.users[idx], self.books[idx], self.ratings[idx]
def prepare_data(df):
"""Prepare and normalize the data with the merged dataframe"""
# Create consistent encoded IDs
df['user_id_encoded'] = df['user_id'].astype('category').cat.codes
df['book_id_encoded'] = df['book_id'].astype('category').cat.codes
# Create mappings
user_id_mapping = dict(enumerate(df['user_id'].astype('category').cat.categories))
book_id_mapping = dict(enumerate(df['book_id'].astype('category').cat.categories))
# Create reverse mappings
reverse_user_mapping = {v: k for k, v in user_id_mapping.items()}
reverse_book_mapping = {v: k for k, v in book_id_mapping.items()}
return df, user_id_mapping, book_id_mapping, reverse_user_mapping, reverse_book_mapping
def make_recommendations(user_id, df, n_recs=5):
"""
Generate book recommendations using only the merged dataframe
"""
try:
# Verify required columns exist
required_cols = ['user_id', 'book_id', 'rating', 'title', 'authors']
if not all(col in df.columns for col in required_cols):
missing = [col for col in required_cols if col not in df.columns]
raise ValueError(f"Dataframe missing required columns: {missing}")
# Create encoded mappings
unique_users = df['user_id'].unique()
unique_books = df['book_id'].unique()
user_to_encoded = {uid: i for i, uid in enumerate(sorted(unique_users))}
book_to_encoded = {bid: i for i, bid in enumerate(sorted(unique_books))}
encoded_to_book = {i: bid for bid, i in book_to_encoded.items()}
# Validate user exists
if user_id not in user_to_encoded:
print(f"User {user_id} not found in data. Available users: {list(unique_users)}")
print("Returning popular books.")
popular = df['book_id'].value_counts().head(n_recs).index
return df[df['book_id'].isin(popular)][['book_id', 'title', 'authors']].drop_duplicates()
# Get unrated books
user_rated = set(df[df['user_id'] == user_id]['book_id'])
unrated_books = [bid for bid in unique_books if bid not in user_rated]
if not unrated_books:
print(f"User {user_id} has rated all books in the system. Returning popular books.")
popular = df['book_id'].value_counts().head(n_recs).index
return df[df['book_id'].isin(popular)][['book_id', 'title', 'authors']].drop_duplicates()
# Model Setup
n_users = len(user_to_encoded)
n_books = len(book_to_encoded)
model = BookRecommender(n_users, n_books).to(device)
# Load or train model
checkpoint_path = 'book_recommender_state.pt'
if os.path.exists(checkpoint_path):
            checkpoint = torch.load(checkpoint_path, map_location=device)  # works across CPU/GPU
if (checkpoint['n_users'] == n_users and
checkpoint['n_books'] == n_books):
model.load_state_dict(checkpoint['model_state_dict'])
else:
print("Model dimensions don't match data. Retraining...")
model = train_model(df, n_users, n_books)
else:
print("No model found. Training new model...")
model = train_model(df, n_users, n_books)
model.eval()
# Generate Recommendations
user_encoded = user_to_encoded[user_id]
valid_book_encodings = []
valid_book_ids = []
for bid in unrated_books:
if bid in book_to_encoded:
valid_book_encodings.append(book_to_encoded[bid])
valid_book_ids.append(bid)
if not valid_book_encodings:
print("No valid books found for recommendations")
return df[['book_id', 'title', 'authors']].head(0)
user_tensor = torch.LongTensor([user_encoded] * len(valid_book_encodings)).to(device)
book_tensor = torch.LongTensor(valid_book_encodings).to(device)
with torch.no_grad():
preds = model(user_tensor, book_tensor)
# Get top recommendations
actual_recs = min(n_recs, len(valid_book_encodings))
_, top_indices = torch.topk(preds, actual_recs)
recommended_book_encodeds = book_tensor[top_indices].cpu().numpy()
recommended_book_ids = [encoded_to_book[e] for e in recommended_book_encodeds]
results = df[df['book_id'].isin(recommended_book_ids)]
return results[['book_id', 'title', 'authors']].drop_duplicates().head(n_recs)
except Exception as e:
print(f"Error generating recommendations: {str(e)}")
return df[['book_id', 'title', 'authors']].head(0)
def train_model(df, n_users, n_books, epochs=5):
"""Train model using the merged dataframe"""
try:
if len(df) == 0:
raise ValueError("No data available for training")
# Prepare data for training
df = df.copy()
df['user_id_encoded'] = df['user_id'].astype('category').cat.codes
df['book_id_encoded'] = df['book_id'].astype('category').cat.codes
dataset = BookRatingDataset(df)
loader = DataLoader(dataset, batch_size=64, shuffle=True)
model = BookRecommender(n_users, n_books).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
for epoch in range(epochs):
model.train()
total_loss = 0
for users, books, ratings in loader:
if users.max() >= n_users or books.max() >= n_books:
raise ValueError(f"Invalid index: users max {users.max()}, books max {books.max()}")
users, books, ratings = users.to(device), books.to(device), ratings.float().to(device)
optimizer.zero_grad()
preds = model(users, books)
loss = criterion(preds, ratings)
loss.backward()
optimizer.step()
total_loss += loss.item()
print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")
        # Save the weights plus the metadata needed to rebuild the model,
        # so make_recommendations can reload it from disk on later calls
        torch.save({
            'model_state_dict': model.state_dict(),
            'n_users': n_users,
            'n_books': n_books,
            'emb_dim': model.user_emb.embedding_dim
        }, 'book_recommender_state.pt')
return model
except Exception as e:
print(f"Error during training: {str(e)}")
raise
# Example usage:
# First prepare the data (only need to do this once)
df_prepared, _, _, _, _ = prepare_data(df)
# Then make recommendations
recommendations = make_recommendations(user_id=2, df=df_prepared)
display(recommendations)
ratings['rating'].value_counts().sort_index()
Key Differences from TensorFlow Version:¶
PyTorch Components:
- Uses nn.Embedding instead of Keras Embedding layers
- Implements a custom Dataset and DataLoader for batching
- Manual training loop with explicit gradient zeroing and backpropagation
Model Architecture:
- Dot-product-style interaction between user and book embeddings, implemented here as a linear layer over the elementwise product user_emb * book_emb, with a sigmoid rescaled to the 1-5 rating range (the plain dot product (u * b).sum(dim=1) is the classic alternative; see the sketch below)
- Embedding weights initialized with small random values
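For comparison, a minimal sketch of that plain dot-product variant (the DotProductRecommender name is hypothetical, not part of the code above):

import torch
import torch.nn as nn

class DotProductRecommender(nn.Module):
    """Pure matrix factorization: the prediction is the user-book embedding dot product."""
    def __init__(self, num_users, num_books, emb_dim=50):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.book_emb = nn.Embedding(num_books, emb_dim)

    def forward(self, user, book):
        u = self.user_emb(user)    # (batch, emb_dim)
        b = self.book_emb(book)    # (batch, emb_dim)
        return (u * b).sum(dim=1)  # (batch,) unbounded rating scores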
Training Process:
- Explicit batch processing
- Manual loss calculation and backpropagation
- Model modes (train() and eval()) for proper dropout/batch norm handling
Recommendation Function:
- Uses PyTorch's topk() for efficient recommendation selection (see the sketch below)
- Moves tensors to GPU if available
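To make the selection step concrete, a tiny sketch of torch.topk on made-up scores:

import torch

preds = torch.tensor([3.2, 4.8, 1.1, 4.5, 2.9])  # predicted ratings for five candidate books
values, indices = torch.topk(preds, k=3)          # three highest predictions
print(values)   # tensor([4.8000, 4.5000, 3.2000])
print(indices)  # tensor([1, 3, 0]) -- positions used to index back into book_tensor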
How It Works:¶
Data Preparation:
- Creates a mapping between user/book IDs and contiguous embedding indices (see the sketch after this list)
- train_test_split is imported for a held-out evaluation split, though this simplified version trains on all ratings
Model Training:
- Learns embeddings that minimize rating prediction error
- Uses the Adam optimizer and MSE loss (same as the TF version)
Making Recommendations:
- For a given user, predicts ratings for all books the user hasn't rated
- Selects the top n_recs (default 5) highest predicted ratings
- Returns book details from the merged dataframe
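A small sketch of the ID-encoding step on made-up data, showing the round trip between raw IDs and embedding indices:

import pandas as pd

toy = pd.DataFrame({'user_id': [42, 7, 42, 99]})
cats = toy['user_id'].astype('category')
toy['user_id_encoded'] = cats.cat.codes             # contiguous 0-based codes: [1, 0, 1, 2]
index_to_id = dict(enumerate(cats.cat.categories))  # {0: 7, 1: 42, 2: 99}
print(toy)
print(index_to_id)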
This implementation maintains the same collaborative filtering approach but gives you more low-level control through PyTorch's imperative programming style. The recommendations will be similar in quality to the TensorFlow version.
In PyTorch you can save the trained model, but instead of the .h5 format used by Keras/TensorFlow, PyTorch typically uses .pt or .pth file extensions. Here's how to modify the code to save and load the model:
Key Differences from TensorFlow's .h5:¶
File Formats:
- PyTorch: .pt or .pth (pickle-based)
- TensorFlow: .h5 (HDF5-based)
Saving Options:
- Entire model: torch.save(model, 'file.pt') (like TF's model.save())
- State dictionary: model.state_dict() (more flexible; both shown in the sketch below)
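A minimal sketch of both saving options, assuming a trained model instance like the one above (file names are illustrative):

import torch

# Option 1: pickle the entire model object -- simple, but the file is tied
# to the exact class definition and module path at load time
torch.save(model, 'book_recommender_full.pt')

# Option 2: save only the learned parameters -- preferred, since it is
# independent of code layout, but the class is needed to rebuild the model
torch.save(model.state_dict(), 'book_recommender_weights.pt')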
Loading Requirements:
- Need the model class definition when loading a state_dict
- Need to call model.eval() for inference
Additional Info:
- PyTorch often saves optimizer state and other metadata alongside the weights
- Can save on GPU and load on CPU with the map_location parameter (see the sketch below)
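A minimal sketch of restoring a state_dict with device handling (assumes the BookRecommender class and n_users/n_books from above):

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Rebuild the architecture first, then restore the learned weights.
# map_location lets a file saved on GPU load safely on a CPU-only machine.
model = BookRecommender(n_users, n_books).to(device)
state = torch.load('book_recommender_weights.pt', map_location=device)
model.load_state_dict(state)
model.eval()  # switch off dropout/batch-norm training behavior before inference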
Best Practices:¶
- For production, save the state_dict rather than the entire model
- Include all necessary metadata (like n_users, n_books)
- Handle device mapping (GPU/CPU) when loading
- Use model.eval() before inference (all four practices are combined in the sketch below)
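Putting these practices together, a sketch of a checkpoint that bundles the weights with the metadata needed to rebuild the model (the same pattern as the torch.save call in train_model):

import torch

# Save: weights plus the dimensions required to reconstruct the architecture
torch.save({
    'model_state_dict': model.state_dict(),
    'n_users': n_users,
    'n_books': n_books,
    'emb_dim': model.user_emb.embedding_dim,
}, 'book_recommender_state.pt')

# Load: rebuild from the stored dimensions, then restore the weights
ckpt = torch.load('book_recommender_state.pt', map_location='cpu')
model = BookRecommender(ckpt['n_users'], ckpt['n_books'], ckpt['emb_dim'])
model.load_state_dict(ckpt['model_state_dict'])
model.eval()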
This implementation gives you the same functionality as the TensorFlow version but with PyTorch's more flexible serialization approach.