Recommender System¶

References:

01: Data¶

import sys
import numpy as np
import pandas as pd

print(sys.version)
print(sys.executable)

3.12.10 | packaged by conda-forge | (main, Apr 10 2025, 22:08:16) [MSC v.1943 64 bit (AMD64)]
C:\Users\Sumedha\.conda\envs\py312\python.exe

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import os

if 'google.colab' in sys.modules:
    !wget -O books.csv "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/books.csv"
    !wget -O ratings.csv "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/ratings.csv"

from pathlib import Path
if 'google.colab' not in sys.modules:
    path_data = Path.home() / 'github/Recommender_System/data/goodbooks_10k'
    books = pd.read_csv(path_data / 'books.csv').head(1000)
    ratings = pd.read_csv(path_data / 'ratings.csv').head(5000)

print(books.shape)
print(ratings.shape)

display(books.head(2))
display(ratings.head(2))

(1000, 23)
(5000, 3)

import os
import torch
import pandas as pd
from torch import nn
from torch.utils.data import DataLoader, Dataset

# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# First merge the dataframes as you requested
df = books.merge(ratings, how='inner',on='book_id')

df.head(2)

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os

# Assuming you have these defined somewhere
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class BookRecommender(nn.Module):
    def __init__(self, num_users, num_books, emb_dim=50):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.book_emb = nn.Embedding(num_books, emb_dim)
        self.fc = nn.Linear(emb_dim, 1)
        
    def forward(self, user, book):
        user_emb = self.user_emb(user)
        book_emb = self.book_emb(book)
        pred = torch.sigmoid(self.fc(user_emb * book_emb))
        return pred.squeeze()

class BookRatingDataset(Dataset):
    def __init__(self, df):
        self.users = torch.LongTensor(df['user_id_encoded'].values)
        self.books = torch.LongTensor(df['book_id_encoded'].values)
        self.ratings = torch.FloatTensor(df['rating'].values)
        
    def __len__(self):
        return len(self.users)
    
    def __getitem__(self, idx):
        return self.users[idx], self.books[idx], self.ratings[idx]

def prepare_data(df):
    """Prepare and normalize the data with the merged dataframe"""
    # Create consistent encoded IDs
    df['user_id_encoded'] = df['user_id'].astype('category').cat.codes
    df['book_id_encoded'] = df['book_id'].astype('category').cat.codes
    
    # Create mappings
    user_id_mapping = dict(enumerate(df['user_id'].astype('category').cat.categories))
    book_id_mapping = dict(enumerate(df['book_id'].astype('category').cat.categories))
    
    # Create reverse mappings
    reverse_user_mapping = {v: k for k, v in user_id_mapping.items()}
    reverse_book_mapping = {v: k for k, v in book_id_mapping.items()}
    
    return df, user_id_mapping, book_id_mapping, reverse_user_mapping, reverse_book_mapping

def make_recommendations(user_id, df, n_recs=5):
    """
    Generate book recommendations using only the merged dataframe
    """
    try:
        # Verify required columns exist
        required_cols = ['user_id', 'book_id', 'rating', 'title', 'authors']
        if not all(col in df.columns for col in required_cols):
            missing = [col for col in required_cols if col not in df.columns]
            raise ValueError(f"Dataframe missing required columns: {missing}")

        # Create encoded mappings
        unique_users = df['user_id'].unique()
        unique_books = df['book_id'].unique()
        
        user_to_encoded = {uid: i for i, uid in enumerate(sorted(unique_users))}
        book_to_encoded = {bid: i for i, bid in enumerate(sorted(unique_books))}
        encoded_to_book = {i: bid for bid, i in book_to_encoded.items()}

        # Validate user exists
        if user_id not in user_to_encoded:
            print(f"User {user_id} not found in data. Available users: {list(unique_users)}")
            print("Returning popular books.")
            popular = df['book_id'].value_counts().head(n_recs).index
            return df[df['book_id'].isin(popular)][['book_id', 'title', 'authors']].drop_duplicates()

        # Get unrated books
        user_rated = set(df[df['user_id'] == user_id]['book_id'])
        unrated_books = [bid for bid in unique_books if bid not in user_rated]
        
        if not unrated_books:
            print(f"User {user_id} has rated all books in the system. Returning popular books.")
            popular = df['book_id'].value_counts().head(n_recs).index
            return df[df['book_id'].isin(popular)][['book_id', 'title', 'authors']].drop_duplicates()

        # Model Setup
        n_users = len(user_to_encoded)
        n_books = len(book_to_encoded)
        
        model = BookRecommender(n_users, n_books).to(device)
        
        # Load or train model
        checkpoint_path = 'book_recommender_state.pt'
        if os.path.exists(checkpoint_path):
            checkpoint = torch.load(checkpoint_path)
            if (checkpoint['n_users'] == n_users and 
                checkpoint['n_books'] == n_books):
                model.load_state_dict(checkpoint['model_state_dict'])
            else:
                print("Model dimensions don't match data. Retraining...")
                model = train_model(df, n_users, n_books)
        else:
            print("No model found. Training new model...")
            model = train_model(df, n_users, n_books)
        
        model.eval()

        # Generate Recommendations
        user_encoded = user_to_encoded[user_id]
        valid_book_encodings = []
        valid_book_ids = []
        
        for bid in unrated_books:
            if bid in book_to_encoded:
                valid_book_encodings.append(book_to_encoded[bid])
                valid_book_ids.append(bid)
        
        if not valid_book_encodings:
            print("No valid books found for recommendations")
            return df[['book_id', 'title', 'authors']].head(0)

        user_tensor = torch.LongTensor([user_encoded] * len(valid_book_encodings)).to(device)
        book_tensor = torch.LongTensor(valid_book_encodings).to(device)
        
        with torch.no_grad():
            preds = model(user_tensor, book_tensor)
        
        # Get top recommendations
        actual_recs = min(n_recs, len(valid_book_encodings))
        _, top_indices = torch.topk(preds, actual_recs)
        
        recommended_book_encodeds = book_tensor[top_indices].cpu().numpy()
        recommended_book_ids = [encoded_to_book[e] for e in recommended_book_encodeds]
        
        results = df[df['book_id'].isin(recommended_book_ids)]
        return results[['book_id', 'title', 'authors']].drop_duplicates().head(n_recs)

    except Exception as e:
        print(f"Error generating recommendations: {str(e)}")
        return df[['book_id', 'title', 'authors']].head(0)

def train_model(df, n_users, n_books, epochs=5):
    """Train model using the merged dataframe"""
    try:
        if len(df) == 0:
            raise ValueError("No data available for training")
            
        # Prepare data for training
        df = df.copy()
        df['user_id_encoded'] = df['user_id'].astype('category').cat.codes
        df['book_id_encoded'] = df['book_id'].astype('category').cat.codes
        
        dataset = BookRatingDataset(df)
        loader = DataLoader(dataset, batch_size=64, shuffle=True)
        
        model = BookRecommender(n_users, n_books).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.MSELoss()
        
        for epoch in range(epochs):
            model.train()
            total_loss = 0
            for users, books, ratings in loader:
                if users.max() >= n_users or books.max() >= n_books:
                    raise ValueError(f"Invalid index: users max {users.max()}, books max {books.max()}")
                
                users, books, ratings = users.to(device), books.to(device), ratings.float().to(device)
                
                optimizer.zero_grad()
                preds = model(users, books)
                loss = criterion(preds, ratings)
                loss.backward()
                optimizer.step()
                
                total_loss += loss.item()
            
            print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")
        
        # Save model
        # torch.save({
        #     'model_state_dict': model.state_dict(),
        #     'n_users': n_users,
        #     'n_books': n_books,
        #     'emb_dim': model.user_emb.embedding_dim
        # }, 'book_recommender_state.pt')
        
        return model
   
    except Exception as e:
        print(f"Error during training: {str(e)}")
        raise

# Example usage:
# First prepare the data (only need to do this once)
df_prepared, _, _, _, _ = prepare_data(df)

# Then make recommendations
recommendations = make_recommendations(user_id=2, df=df_prepared)
display(recommendations)

No model found. Training new model...
Epoch 1, Loss: 12.1666
Epoch 2, Loss: 12.0537
Epoch 3, Loss: 11.9618
Epoch 4, Loss: 11.8940
Epoch 5, Loss: 11.7762

ratings['rating'].value_counts().sort_index()

rating
1      87
2     367
3    1228
4    1881
5    1437
Name: count, dtype: int64

Key Differences from TensorFlow Version:¶

PyTorch Components:
- Uses nn.Embedding instead of Keras Embedding layers
- Implements custom Dataset and DataLoader for batching
- Manual training loop with explicit gradient zeroing and backpropagation
Model Architecture:
- Same dot product approach but implemented as (u * b).sum(dim=1)
- Embedding weights initialized with small random values
Training Process:
- Explicit batch processing
- Manual loss calculation and backpropagation
- Model modes (train() and eval()) for proper dropout/batch norm handling
Recommendation Function:
- Uses PyTorch's topk() for efficient recommendation selection
- Moves tensors to GPU if available

How It Works:¶

Data Preparation:
- Creates mapping between user/book IDs and embedding indices
- Splits data into train/test sets
Model Training:
- Learns embeddings that minimize rating prediction error
- Uses Adam optimizer and MSE loss (same as TF version)
Making Recommendations:
- For a given user, predicts ratings for all books
- Selects top 5 highest predicted ratings
- Returns book details from the books.csv file

This implementation maintains the same collaborative filtering approach but gives you more low-level control through PyTorch's imperative programming style. The recommendations will be similar in quality to the TensorFlow version.

In PyTorch you can save the trained model, but instead of the .h5 format used by Keras/TensorFlow, PyTorch typically uses .pt or .pth file extensions. Here's how to modify the code to save and load the model:

Key Differences from TensorFlow's .h5:¶

File Formats:
- PyTorch: .pt or .pth (pickle-based)
- TensorFlow: .h5 (HDF5-based)
Saving Options:
- Entire model: torch.save(model, 'file.pt') (like TF's model.save())
- State dictionary: model.state_dict() (more flexible)
Loading Requirements:
- Need the model class definition when loading state_dict
- Need to call model.eval() for inference
Additional Info:
- PyTorch often saves optimizer state and other metadata
- Can save on GPU and load on CPU with map_location parameter

Best Practices:¶

For production, save state_dict rather than entire model
Include all necessary metadata (like n_users, n_books)
Handle device mapping (GPU/CPU) when loading
Use model.eval() before inference

This implementation gives you the same functionality as the TensorFlow version but with PyTorch's more flexible serialization approach.

	book_id	goodreads_book_id	best_book_id	work_id	books_count	isbn	isbn13	authors	original_publication_year	original_title	...	ratings_count	work_ratings_count	work_text_reviews_count	ratings_1	ratings_2	ratings_3	ratings_4	ratings_5	image_url	small_image_url
0	1	2767052	2767052	2792775	272	439023483	9.780439e+12	Suzanne Collins	2008.0	The Hunger Games	...	4780653	4942365	155254	66715	127936	560092	1481305	2706317	https://images.gr-assets.com/books/1447303603m...	https://images.gr-assets.com/books/1447303603s...
1	2	3	3	4640799	491	439554934	9.780440e+12	J.K. Rowling, Mary GrandPré	1997.0	Harry Potter and the Philosopher's Stone	...	4602479	4800065	75867	75504	101676	455024	1156318	3011543	https://images.gr-assets.com/books/1474154022m...	https://images.gr-assets.com/books/1474154022s...

	book_id	goodreads_book_id	best_book_id	work_id	books_count	isbn	isbn13	authors	original_publication_year	original_title	...	work_text_reviews_count	ratings_1	ratings_2	ratings_3	ratings_4	ratings_5	image_url	small_image_url	user_id	rating
0	2	3	3	4640799	491	439554934	9.780440e+12	J.K. Rowling, Mary GrandPré	1997.0	Harry Potter and the Philosopher's Stone	...	75867	75504	101676	455024	1156318	3011543	https://images.gr-assets.com/books/1474154022m...	https://images.gr-assets.com/books/1474154022s...	4	5
1	2	3	3	4640799	491	439554934	9.780440e+12	J.K. Rowling, Mary GrandPré	1997.0	Harry Potter and the Philosopher's Stone	...	75867	75504	101676	455024	1156318	3011543	https://images.gr-assets.com/books/1474154022m...	https://images.gr-assets.com/books/1474154022s...	15	4

	book_id	title	authors
702	38	The Time Traveler's Wife	Audrey Niffenegger
1119	101	Me Talk Pretty One Day	David Sedaris
1591	198	The Color Purple	Alice Walker
2052	366	John Adams	David McCullough
2972	941	The Hours	Michael Cunningham