Introduction

In [1]:
import sys
print(sys.version)
print(sys.executable)
3.12.10 | packaged by conda-forge | (main, Apr 10 2025, 22:08:16) [MSC v.1943 64 bit (AMD64)]
C:\Users\Sumedha\.conda\envs\py312\python.exe
In [2]:
import pandas as pd
import numpy as np
import scipy

%load_ext watermark
%watermark -iv
pandas: 2.2.3
scipy : 1.15.3
sys   : 3.12.10 | packaged by conda-forge | (main, Apr 10 2025, 22:08:16) [MSC v.1943 64 bit (AMD64)]
numpy : 1.26.4

Data

In [3]:
if 'google.colab' in sys.modules:
    !wget -O books.csv "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/books.csv"
    !wget -O ratings.csv "https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/ratings.csv"
In [4]:
from pathlib import Path
if 'google.colab' not in sys.modules:
    path_data = Path.home() / 'github/Recommender_System/data/goodbooks_10k'
    books = pd.read_csv(path_data / 'books.csv').head(1000)
    ratings = pd.read_csv(path_data / 'ratings.csv').head(5000)

print(books.shape)
print(ratings.shape)

display(books.head(2))
display(ratings.head(2))
(1000, 23)
(5000, 3)
book_id goodreads_book_id best_book_id work_id books_count isbn isbn13 authors original_publication_year original_title ... ratings_count work_ratings_count work_text_reviews_count ratings_1 ratings_2 ratings_3 ratings_4 ratings_5 image_url small_image_url
0 1 2767052 2767052 2792775 272 439023483 9.780439e+12 Suzanne Collins 2008.0 The Hunger Games ... 4780653 4942365 155254 66715 127936 560092 1481305 2706317 https://images.gr-assets.com/books/1447303603m... https://images.gr-assets.com/books/1447303603s...
1 2 3 3 4640799 491 439554934 9.780440e+12 J.K. Rowling, Mary GrandPré 1997.0 Harry Potter and the Philosopher's Stone ... 4602479 4800065 75867 75504 101676 455024 1156318 3011543 https://images.gr-assets.com/books/1474154022m... https://images.gr-assets.com/books/1474154022s...

2 rows × 23 columns

user_id book_id rating
0 1 258 5
1 2 4081 4
In [5]:
print(books.shape)
print(books.columns)
books.head(2)
(1000, 23)
Index(['book_id', 'goodreads_book_id', 'best_book_id', 'work_id',
       'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year',
       'original_title', 'title', 'language_code', 'average_rating',
       'ratings_count', 'work_ratings_count', 'work_text_reviews_count',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'image_url', 'small_image_url'],
      dtype='object')
Out[5]:
book_id goodreads_book_id best_book_id work_id books_count isbn isbn13 authors original_publication_year original_title ... ratings_count work_ratings_count work_text_reviews_count ratings_1 ratings_2 ratings_3 ratings_4 ratings_5 image_url small_image_url
0 1 2767052 2767052 2792775 272 439023483 9.780439e+12 Suzanne Collins 2008.0 The Hunger Games ... 4780653 4942365 155254 66715 127936 560092 1481305 2706317 https://images.gr-assets.com/books/1447303603m... https://images.gr-assets.com/books/1447303603s...
1 2 3 3 4640799 491 439554934 9.780440e+12 J.K. Rowling, Mary GrandPré 1997.0 Harry Potter and the Philosopher's Stone ... 4602479 4800065 75867 75504 101676 455024 1156318 3011543 https://images.gr-assets.com/books/1474154022m... https://images.gr-assets.com/books/1474154022s...

2 rows × 23 columns

In [6]:
books_cols = ['book_id', 'authors', 'original_publication_year', 'title', 'average_rating']
books2 = books[books_cols]
books2.head(2)
Out[6]:
book_id authors original_publication_year title average_rating
0 1 Suzanne Collins 2008.0 The Hunger Games (The Hunger Games, #1) 4.34
1 2 J.K. Rowling, Mary GrandPré 1997.0 Harry Potter and the Sorcerer's Stone (Harry P... 4.44
In [7]:
print(ratings.shape)
print(ratings.columns)
ratings.head(2)
(5000, 3)
Index(['user_id', 'book_id', 'rating'], dtype='object')
Out[7]:
user_id book_id rating
0 1 258 5
1 2 4081 4
In [8]:
df = pd.merge(books, ratings, on="book_id", how="inner")
print(df.shape)
df.head(2)
(3031, 25)
Out[8]:
book_id goodreads_book_id best_book_id work_id books_count isbn isbn13 authors original_publication_year original_title ... work_text_reviews_count ratings_1 ratings_2 ratings_3 ratings_4 ratings_5 image_url small_image_url user_id rating
0 2 3 3 4640799 491 439554934 9.780440e+12 J.K. Rowling, Mary GrandPré 1997.0 Harry Potter and the Philosopher's Stone ... 75867 75504 101676 455024 1156318 3011543 https://images.gr-assets.com/books/1474154022m... https://images.gr-assets.com/books/1474154022s... 4 5
1 2 3 3 4640799 491 439554934 9.780440e+12 J.K. Rowling, Mary GrandPré 1997.0 Harry Potter and the Philosopher's Stone ... 75867 75504 101676 455024 1156318 3011543 https://images.gr-assets.com/books/1474154022m... https://images.gr-assets.com/books/1474154022s... 15 4

2 rows × 25 columns

5. Knowledge-Based Recommender

Concept: Uses explicit knowledge about items and user preferences.

In [9]:
def knowledge_based_recommender(user_id, ratings_df, books_df, n_recommendations=5):
    # Get user's preferred features (e.g., from past ratings)
    user_books = ratings_df[ratings_df['user_id'] == user_id]
    user_books = user_books.merge(books_df, left_on='book_id', right_on='book_id')

    # Extract preferences (example: top authors)
    if not user_books.empty:
        top_authors = user_books['authors'].value_counts().index[0]
        top_year = user_books['original_publication_year'].mode()[0]
    else:
        # Default preferences for cold start
        top_authors = "J.K. Rowling"
        top_year = 2000

    # Filter books based on knowledge rules
    recommendations = books_df[
        (books_df['authors'].str.contains(top_authors)) |
        (abs(books_df['original_publication_year'] - top_year) <= 5)
    ]

    # Sort by popularity/rating
    avg_ratings = ratings_df.groupby('book_id')['rating'].mean()
    recommendations = recommendations.merge(avg_ratings, left_on='book_id', right_index=True)

    # Remove already rated
    rated_books = ratings_df[ratings_df['user_id'] == user_id]['book_id']
    recommendations = recommendations[~recommendations['book_id'].isin(rated_books)]

    return recommendations.sort_values('rating', ascending=False).head(n_recommendations)

# Usage
user_id = ratings['user_id'].iloc[0]
df_out = knowledge_based_recommender(user_id, ratings, books)
df_out
Out[9]:
book_id goodreads_book_id best_book_id work_id books_count isbn isbn13 authors original_publication_year original_title ... work_ratings_count work_text_reviews_count ratings_1 ratings_2 ratings_3 ratings_4 ratings_5 image_url small_image_url rating
479 480 18122 18122 1774510 42 440238153 9.780440e+12 Philip Pullman 2000.0 The Amber Spyglass ... 217610 6693 4739 11883 39131 70648 91209 https://images.gr-assets.com/books/1329189152m... https://images.gr-assets.com/books/1329189152s... 5.0
238 239 8908 8908 817 137 307346609 9.780307e+12 Max Brooks 2006.0 World War Z: An Oral History of the Zombie War ... 345149 22412 7680 19887 65570 121537 130475 https://images.gr-assets.com/books/1386328204m... https://images.gr-assets.com/books/1386328204s... 5.0
406 407 5826 5826 859342 77 60838728 9.780061e+12 Ann Patchett 2001.0 Bel Canto ... 200256 12134 4523 13395 43193 71190 67955 https://images.gr-assets.com/books/1352997328m... https://images.gr-assets.com/books/1352997328s... 5.0
366 367 14995 14995 4574034 82 553384287 9.780553e+12 Dean Koontz 2003.0 Odd Thomas ... 213895 6357 10963 12869 39433 65984 84646 https://s.gr-assets.com/assets/nophoto/book/11... https://s.gr-assets.com/assets/nophoto/book/50... 5.0
608 609 10567 10567 3017730 123 1416524517 9.781417e+12 Stephen King 2006.0 Cell ... 153150 4841 5563 16309 44687 47447 39144 https://s.gr-assets.com/assets/nophoto/book/11... https://s.gr-assets.com/assets/nophoto/book/50... 5.0

5 rows × 24 columns

In [ ]: