Varun Nadgir
Applied Data Science Program 2021
import warnings
warnings.filterwarnings('ignore')
# numpy, pandas for data transformation and df manipulation
import numpy as np
import pandas as pd
# matplotlib, seaborn for visualizations
import matplotlib.pyplot as plt
import seaborn as sns
# cosine similarity calculation
from sklearn.metrics.pairwise import cosine_similarity
# splitting data into train and test sets
from sklearn.model_selection import train_test_split
# RMSE and MSE success measures to judge model
from sklearn.metrics import mean_squared_error
# import datasets: song metadata and user play counts
# NOTE(review): paths are relative to the current working directory
song_df = pd.read_csv('song_data.csv')
count_df = pd.read_csv('count_data.csv')
# dropping redundant index column (an artifact of a previous to_csv with index=True)
count_df = count_df.drop('Unnamed: 0', axis=1)
# quick look at the song metadata df
song_df.head()
# quick look at the play-count df
count_df.head()
# check for missing values per column in the song metadata
song_df.isna().sum()
# check for missing values per column in the play counts
count_df.isna().sum()
# check how many song ids are unique in each table
print("song_df unique id count: ", song_df['song_id'].nunique())
print("count_df unique id count:", count_df['song_id'].nunique())
# check sizes (rows x columns) of both dataframes
song_rows, song_columns = song_df.shape
print("No of rows: ", song_rows)
print("No of columns: ", song_columns)
count_rows, count_columns = count_df.shape
print("No of rows: ", count_rows)
print("No of columns: ", count_columns)
# histogram of play counts, clipped to the 0-20 range (the distribution has a long right tail)
plt.hist(count_df['play_count'], range=(0,20))
plt.title("Play Count")
plt.show()
# what does the upper end look like -- 10 largest play counts
count_df.sort_values(by='play_count', ascending=False).head(10)
# drop duplicated song ids, remove year = 0 obs
# NOTE(review): keep=False removes *every* copy of a duplicated
# (title, artist_name) pair, keeping none of them -- confirm that is intended
song_df = song_df.drop_duplicates(subset=['title', 'artist_name'], keep=False)
song_df = song_df[song_df['year'] != 0]
song_df.head()
count_df.head()
To make our data easier to work with, I will re-index the song and user id columns. By taking their unique values, I can map them to integer values 0 to n-1, where n is the number of unique ids for that column. This avoids having to use long alphanumeric strings to look up users and songs.
# combine all song ids from both tables into one Series
# (Series.append was removed in pandas 2.0 -- pd.concat is the replacement)
all_songs = pd.concat([song_df['song_id'], count_df['song_id']])
# take only unique ids (np.unique also sorts them)
song_id_list = list(np.unique(all_songs))
user_id_list = list(np.unique(count_df['user_id']))
# dictionary of "original song id" : "new integer id" (0 .. n-1)
song_id_dict = {songid: new_id for new_id, songid in enumerate(song_id_list)}
song_count = len(song_id_dict)
# dictionary of "original user id" : "new integer id" (0 .. n-1)
user_id_dict = {userid: new_id for new_id, userid in enumerate(user_id_list)}
user_count = len(user_id_dict)
# map original song/user ids to the new compact ids based on the dictionaries
# (ids missing from a dictionary become NaN -- checked below)
song_df['song_id'] = song_df['song_id'].map(song_id_dict)
count_df['song_id'] = count_df['song_id'].map(song_id_dict)
count_df['user_id'] = count_df['user_id'].map(user_id_dict)
song_df.head()
count_df.head()
# check if any ids failed to convert (failed .map leads to NaN value)
count_df.isna().sum()
song_df.isna().sum()
# keep only observations with strictly more than 15 plays -- reduces the
# processing load and focuses results on long-term listeners
count_df_15 = count_df[count_df['play_count'] > 15]
# pivot the row-wise (user, song, plays) observations into a user x item
# matrix, filling missing user/song pairs with 0 so arithmetic works later
interactions_matrix = count_df_15.pivot(index='user_id', columns='song_id', values='play_count').fillna(0)
interactions_matrix.head()
# function to input user id to find most similar users
def similar_users(user_id, interactions_matrix):
    """Rank every other user by cosine similarity to the given user.

    Parameters
    ----------
    user_id : label present in ``interactions_matrix.index``
    interactions_matrix : pd.DataFrame
        User x song play-count matrix (rows are users).

    Returns
    -------
    tuple[list, list]
        (most_similar_users, similarity_score): parallel lists sorted by
        similarity, descending; the queried user itself is excluded.
    """
    matrix = interactions_matrix.to_numpy(dtype=float)
    row_pos = interactions_matrix.index.get_loc(user_id)
    target = matrix[row_pos]
    # one vectorized cosine-similarity pass instead of one sklearn call per
    # user (the original looped pairwise, which is very slow on large data)
    norms = np.linalg.norm(matrix, axis=1)
    denom = norms * np.linalg.norm(target)
    with np.errstate(divide='ignore', invalid='ignore'):
        sims = np.where(denom > 0, matrix @ target / denom, 0.0)
    # exclude the queried user's own row *by position* before sorting; the
    # original removed the top score by value, which breaks when another
    # user ties at similarity 1.0
    keep = np.arange(len(sims)) != row_pos
    other_users = interactions_matrix.index.to_numpy()[keep]
    other_sims = sims[keep]
    # stable descending sort to mirror Python's stable list.sort(reverse=True)
    order = np.argsort(-other_sims, kind='stable')
    most_similar_users = list(other_users[order])
    similarity_score = [float(s) for s in other_sims[order]]
    return most_similar_users, similarity_score
# top 10 most similar users to given id based on cosine similarity
# (element [0] of the returned tuple is the ranked list of user ids)
similar_users(4395, interactions_matrix)[0][:10]
# top 10 most similar users' similarity rating
# (element [1] is the parallel list of similarity scores)
similar_users(4395, interactions_matrix)[1][:10]
# function to find similar users and what they have listened to
def recommendations(user_id, num_songs, interactions_matrix):
    """Collaborative-filtering recommendations: up to `num_songs` song ids
    played by the most similar users but not yet by `user_id`."""
    # users ranked by cosine similarity, most similar first
    ranked_users = similar_users(user_id, interactions_matrix)[0]
    # songs the given user has already interacted with
    seen = set(interactions_matrix.columns[np.where(interactions_matrix.loc[user_id] > 0)])
    recs = []
    for other_user in ranked_users:
        # stop once enough candidate songs have been gathered
        if len(recs) >= num_songs:
            break
        # songs this similar user has played
        other_songs = set(interactions_matrix.columns[np.where(interactions_matrix.loc[other_user] > 0)])
        # keep only songs not already played or already recommended
        recs.extend(list(other_songs.difference(seen)))
        seen = seen.union(other_songs)
    return recs[:num_songs]
# get song ids from recommendations function
song_recs = recommendations(4395, 10, interactions_matrix)
# display top 10 recommended songs for the given user
# (notebook-style expression: shows metadata rows for the recommended ids)
song_df[song_df['song_id'].isin(song_recs)]
# get SVD matrices; full_matrices=False gives the reduced ("economy") SVD
u, s, vt = np.linalg.svd(interactions_matrix, full_matrices=False)
# u: one row per user, one column per latent feature
# (counts below reflect the original run of this dataset)
# 19103 users, 7887 latent features
u.shape
# s: one singular value per latent feature
# 7887 latent features
s.shape
# vt: one row per latent feature, one column per song
# 7887 latent features, 7887 items
vt.shape
# split count_df into train and test
x_train, x_test = train_test_split(count_df_15, test_size=0.2, random_state=42)
# 80% of the data
x_train.shape
# 20% of the data
x_test.shape
# train interaction matrix (pivot sorts both the user and song axes)
interactions_matrix_train = x_train.pivot(index='user_id', columns='song_id', values='play_count')
interactions_matrix_train.fillna(0, inplace=True)
# test interaction matrix
interactions_matrix_test = x_test.pivot(index='user_id', columns='song_id', values='play_count')
interactions_matrix_test.fillna(0, inplace=True)
# find unique users in train and test data and then take their intersection i.e. common users in train and test data
train_idx = set(interactions_matrix_train.index)
test_idx = set(interactions_matrix_test.index)
match_idx = train_idx.intersection(test_idx)
# find unique songs in train and test data and then take their intersection i.e. common songs in train and test data
train_songs = set(interactions_matrix_train.columns)
test_songs = set(interactions_matrix_test.columns)
match_cols = train_songs.intersection(test_songs)
# selecting only common users and songs from the test interaction matrix;
# sorted lists because .loc no longer accepts raw sets (removed in
# pandas >= 2.0) and sorting keeps rows/columns in the same sorted order as
# the train pivot, so later row/column masks on the train matrix line up
interactions_matrix_test = interactions_matrix_test.loc[sorted(match_idx), sorted(match_cols)]
# break down train matrix with SVD
u_train, s_train, vt_train = np.linalg.svd(interactions_matrix_train, full_matrices=False)
# finding u_test and vt_test matrices using u_train, vt_train and common user/songs in train and test data
# boolean masks over the *train* axes: isin(test_idx)/isin(test_songs) keeps
# exactly the users/songs common to both splits, in train-matrix order
row_idxs = interactions_matrix_train.index.isin(test_idx)
col_idxs = interactions_matrix_train.columns.isin(test_songs)
# NOTE(review): this assumes interactions_matrix_test rows/columns are in the
# same (sorted) order as the masked train matrix -- verify the alignment
u_test = u_train[row_idxs, :]
vt_test = vt_train[:, col_idxs]
# sweep over number of latent features (0, 20, 40, ..., 680) and record errors
latent_features = np.arange(0, 700, 20)
train_error = []
test_error = []
for k in latent_features:
    # slice the U, S, and Vt matrices to get k latent features from train and test data
    s_train_lat, u_train_lat, vt_train_lat = np.diag(s_train[:k]), u_train[:, :k], vt_train[:k, :]
    u_test_lat, vt_test_lat = u_test[:, :k], vt_test[:k, :]
    # regenerate train and test interaction matrices using k latent features
    # (k = 0 degenerates to an all-zero prediction matrix)
    interactions_matrix_train_preds = np.around(np.matmul(np.matmul(u_train_lat, s_train_lat), vt_train_lat))
    interactions_matrix_test_preds = np.around(np.matmul(np.matmul(u_test_lat, s_train_lat), vt_test_lat))
    # calculate the actual and predicted average rating for each song in the training data
    avg_rating_train = interactions_matrix_train.mean(axis=0)
    avg_rating_train_pred = interactions_matrix_train_preds.mean(axis=0)
    # calculate the actual and predicted average rating for each song in the test data
    avg_rating_test = interactions_matrix_test.mean(axis=0)
    avg_rating_test_pred = interactions_matrix_test_preds.mean(axis=0)
    # calculate train and test RMSE (squared=False returns the root of the MSE)
    # NOTE(review): the `squared` kwarg was removed in scikit-learn >= 1.6;
    # newer versions provide sklearn.metrics.root_mean_squared_error instead
    train_rmse = mean_squared_error(avg_rating_train, avg_rating_train_pred, squared=False)
    test_rmse = mean_squared_error(avg_rating_test, avg_rating_test_pred, squared=False)
    train_error.append(train_rmse)
    test_error.append(test_rmse)
# plotting train and test RMSE against the number of latent features
plt.figure(figsize=(10,7))
plt.title('Latent Feature Model Comparison')
plt.plot(latent_features, train_error, label='Train', marker='o');
plt.plot(latent_features, test_error, label='Test', marker='o');
plt.xlabel('Number of Latent Features');
plt.ylabel('RMSE');
plt.legend();
plt.show()
# rebuild the prediction matrices for chosen latent-feature counts;
# one helper replaces three near-identical copy/paste cells
def _predicted_plays(k):
    """Return predicted play counts reconstructed from the top-k latent features.

    Uses the full-data SVD factors (u, s, vt) computed above; the
    reconstruction |round(U_k S_k Vt_k)| is rounded to whole plays and
    abs() clamps occasional negative reconstructions to usable magnitudes.
    """
    s_k = np.diag(s[:k])
    preds = np.around(np.matmul(np.matmul(u[:, :k], s_k), vt[:k, :]))
    return pd.DataFrame(np.abs(preds), columns=interactions_matrix.columns, index=interactions_matrix.index)

# rerun model with set number of latent features = 10
song_predicted_plays_10 = _predicted_plays(10)
song_predicted_plays_10.head()
# rerun model with set number of latent features = 20
song_predicted_plays_20 = _predicted_plays(20)
song_predicted_plays_20.head()
# rerun model with set number of latent features = 40
song_predicted_plays_40 = _predicted_plays(40)
song_predicted_plays_40.head()
# recommend the songs with the highest predicted plays
def recommend_songs(user_idx, interactions_matrix, preds_df, num_recommendations):
    """Print the top unplayed songs for the user at positional index
    `user_idx`, ranked by predicted play count (returns None)."""
    # actual and predicted play counts for this user, highest first
    actual = interactions_matrix.iloc[user_idx].sort_values(ascending=False)
    predicted = preds_df.iloc[user_idx].sort_values(ascending=False)
    # place the two series side by side, aligned on song id
    combined = pd.concat([actual, predicted], axis=1)
    combined.index.name = 'Recommended Songs'
    combined.columns = ['user_plays', 'user_predictions']
    # keep songs the user has never played (actual plays == 0)
    unplayed = combined.loc[combined.user_plays == 0]
    # rank the unplayed songs by predicted interest
    unplayed = unplayed.sort_values('user_predictions', ascending=False)
    print('\nBelow are the recommended songs for user(user_id = {}):\n'.format(user_idx))
    print(unplayed['user_predictions'].head(num_recommendations))
# return predicted number of plays, sorted by highest (prints; returns None)
recommend_songs(3125, interactions_matrix, song_predicted_plays_10, 5)
# display song information based on song id
# NOTE(review): these ids were transcribed by hand from the printout above --
# they will change if the data or the model changes
song_df[song_df['song_id'].isin([365178, 329904, 15080, 350728, 297789])]
# create a dataframe containing average actual plays and average predicted plays of the songs
rmse_df = pd.concat([interactions_matrix.mean(), song_predicted_plays_10.mean(),
song_predicted_plays_20.mean(), song_predicted_plays_40.mean()], axis=1)
rmse_df.columns = ['avg_actual_plays', 'avg_predicted_plays_10', 'avg_predicted_plays_20', 'avg_predicted_plays_40']
rmse_df.head()
# low RMSE means predictions are close to actual number of plays
# NOTE(review): the `squared` kwarg was removed in scikit-learn >= 1.6
RMSE_10 = mean_squared_error(rmse_df['avg_actual_plays'], rmse_df['avg_predicted_plays_10'], squared=False)
RMSE_20 = mean_squared_error(rmse_df['avg_actual_plays'], rmse_df['avg_predicted_plays_20'], squared=False)
RMSE_40 = mean_squared_error(rmse_df['avg_actual_plays'], rmse_df['avg_predicted_plays_40'], squared=False)
print('\nRMSE SVD Model: 10 Latent Features = {} \n'.format(RMSE_10))
print('\nRMSE SVD Model: 20 Latent Features = {} \n'.format(RMSE_20))
print('\nRMSE SVD Model: 40 Latent Features = {} \n'.format(RMSE_40))
# recommend the songs with the highest predicted plays
def recommend_songs_2(user_idx, interactions_matrix, preds_df, num_recommendations):
    """Return the song ids (index values) of the top unplayed songs for the
    user at positional index `user_idx`, ranked by predicted play count."""
    # actual and predicted play counts for this user, highest first
    actual = interactions_matrix.iloc[user_idx].sort_values(ascending=False)
    predicted = preds_df.iloc[user_idx].sort_values(ascending=False)
    # side-by-side frame aligned on song id
    combined = pd.concat([actual, predicted], axis=1)
    combined.index.name = 'Recommended Songs'
    combined.columns = ['user_plays', 'user_predictions']
    # songs the user has never played, ranked by predicted interest
    unplayed = combined.loc[combined.user_plays == 0]
    ranked = unplayed.sort_values('user_predictions', ascending=False)
    # returns ids instead of printing, so the result can be reused downstream
    return ranked.index[:num_recommendations]
def hybrid_recs(user_id, num_recommendations):
    """Combine collaborative-filtering and SVD recommendations for a user
    and return the song metadata rows for the merged id set."""
    # first set of recs: user-user collaborative filtering
    cf_recs = recommendations(user_id, num_recommendations, interactions_matrix)
    # second set of recs: SVD reconstruction with 40 latent features
    svd_recs = list(recommend_songs_2(user_id, interactions_matrix, song_predicted_plays_40, num_recommendations))
    # merge both lists, keeping each song id once
    merged_ids = np.unique(cf_recs + svd_recs)
    # look up and return the metadata for the combined recommendations
    return song_df[song_df['song_id'].isin(merged_ids)]
# display the hybrid (CF + SVD) recommendations for user 19, up to 10 per source
hybrid_recs(19, 10)