Varun Nadgir
Applied Data Science Program 2021
import warnings
warnings.filterwarnings('ignore')
# numpy, pandas for data transformation and df manipulation
import numpy as np
import pandas as pd
# matplotlib, seaborn for visualizations
import matplotlib.pyplot as plt
import seaborn as sns
# cosine similarity calculation
from sklearn.metrics.pairwise import cosine_similarity
# splitting data into train and test sets
from sklearn.model_selection import train_test_split
# RMSE and MSE success measures to judge model
from sklearn.metrics import mean_squared_error
# import datasets: song metadata and user play counts
# NOTE(review): paths are relative to the current working directory
song_df = pd.read_csv('song_data.csv')
count_df = pd.read_csv('count_data.csv')
# dropping redundant index column (an artifact of a previous to_csv with index=True)
count_df = count_df.drop('Unnamed: 0', axis=1)
# quick look at the song metadata df
song_df.head()
# quick look at the play-count df
count_df.head()
# check for missing values per column in the song metadata
song_df.isna().sum()
# check for missing values per column in the play counts
count_df.isna().sum()
# check how many song ids are unique in each table
print("song_df unique id count: ", song_df['song_id'].nunique())
print("count_df unique id count:", count_df['song_id'].nunique())
# check sizes (rows x columns) of both dataframes
song_rows, song_columns = song_df.shape
print("No of rows: ", song_rows)
print("No of columns: ", song_columns)
count_rows, count_columns = count_df.shape
print("No of rows: ", count_rows)
print("No of columns: ", count_columns)
# histogram of play counts, clipped to the 0-20 range (the distribution has a long right tail)
plt.hist(count_df['play_count'], range=(0,20))
plt.title("Play Count")
plt.show()
# what does the upper end look like -- 10 largest play counts
count_df.sort_values(by='play_count', ascending=False).head(10)
# drop duplicated song ids, remove year = 0 obs
# NOTE(review): keep=False removes *every* copy of a duplicated
# (title, artist_name) pair, keeping none of them -- confirm that is intended
song_df = song_df.drop_duplicates(subset=['title', 'artist_name'], keep=False)
song_df = song_df[song_df['year'] != 0]
song_df.head()
count_df.head()
To make our data easier to work with, I will re-index the song and user id columns. By taking their unique values, I can map them to integer values 0 to n-1, where n is the number of unique ids for that column. This avoids having to use long alphanumeric strings to look up users and songs.
# combine all song ids from both tables into one Series
# (Series.append was removed in pandas 2.0 -- pd.concat is the replacement)
all_songs = pd.concat([song_df['song_id'], count_df['song_id']])
# take only unique ids (np.unique also sorts them)
song_id_list = list(np.unique(all_songs))
user_id_list = list(np.unique(count_df['user_id']))
# dictionary of "original song id" : "new integer id" (0 .. n-1)
song_id_dict = {songid: new_id for new_id, songid in enumerate(song_id_list)}
song_count = len(song_id_dict)
# dictionary of "original user id" : "new integer id" (0 .. n-1)
user_id_dict = {userid: new_id for new_id, userid in enumerate(user_id_list)}
user_count = len(user_id_dict)
# map original song/user ids to the new compact ids based on the dictionaries
# (ids missing from a dictionary become NaN -- checked below)
song_df['song_id'] = song_df['song_id'].map(song_id_dict)
count_df['song_id'] = count_df['song_id'].map(song_id_dict)
count_df['user_id'] = count_df['user_id'].map(user_id_dict)
song_df.head()
count_df.head()
# check if any ids failed to convert (failed .map leads to NaN value)
count_df.isna().sum()
song_df.isna().sum()
# keep only observations with strictly more than 15 plays -- reduces the
# processing load and focuses results on long-term listeners
count_df_15 = count_df[count_df['play_count'] > 15]
# pivot the row-wise (user, song, plays) observations into a user x item
# matrix, filling missing user/song pairs with 0 so arithmetic works later
interactions_matrix = count_df_15.pivot(index='user_id', columns='song_id', values='play_count').fillna(0)
interactions_matrix.head()
# function to input user id to find most similar users
def similar_users(user_id, interactions_matrix):
    """Rank every other user by cosine similarity to the given user.

    Parameters
    ----------
    user_id : label present in ``interactions_matrix.index``
    interactions_matrix : pd.DataFrame
        User x song play-count matrix (rows are users).

    Returns
    -------
    tuple[list, list]
        (most_similar_users, similarity_score): parallel lists sorted by
        similarity, descending; the queried user itself is excluded.
    """
    matrix = interactions_matrix.to_numpy(dtype=float)
    row_pos = interactions_matrix.index.get_loc(user_id)
    target = matrix[row_pos]
    # one vectorized cosine-similarity pass instead of one sklearn call per
    # user (the original looped pairwise, which is very slow on large data)
    norms = np.linalg.norm(matrix, axis=1)
    denom = norms * np.linalg.norm(target)
    with np.errstate(divide='ignore', invalid='ignore'):
        sims = np.where(denom > 0, matrix @ target / denom, 0.0)
    # exclude the queried user's own row *by position* before sorting; the
    # original removed the top score by value, which breaks when another
    # user ties at similarity 1.0
    keep = np.arange(len(sims)) != row_pos
    other_users = interactions_matrix.index.to_numpy()[keep]
    other_sims = sims[keep]
    # stable descending sort to mirror Python's stable list.sort(reverse=True)
    order = np.argsort(-other_sims, kind='stable')
    most_similar_users = list(other_users[order])
    similarity_score = [float(s) for s in other_sims[order]]
    return most_similar_users, similarity_score
# top 10 most similar users to given id based on cosine similarity
# (element [0] of the returned tuple is the ranked list of user ids)
similar_users(4395, interactions_matrix)[0][:10]
# top 10 most similar users' similarity rating
# (element [1] is the parallel list of similarity scores)
similar_users(4395, interactions_matrix)[1][:10]
# function to find similar users and what they have listened to
def recommendations(user_id, num_songs, interactions_matrix):
    """Collaborative-filtering recommendations: up to `num_songs` song ids
    played by the most similar users but not yet by `user_id`."""
    # users ranked by cosine similarity, most similar first
    ranked_users = similar_users(user_id, interactions_matrix)[0]
    # songs the given user has already interacted with
    seen = set(interactions_matrix.columns[np.where(interactions_matrix.loc[user_id] > 0)])
    recs = []
    for other_user in ranked_users:
        # stop once enough candidate songs have been gathered
        if len(recs) >= num_songs:
            break
        # songs this similar user has played
        other_songs = set(interactions_matrix.columns[np.where(interactions_matrix.loc[other_user] > 0)])
        # keep only songs not already played or already recommended
        recs.extend(list(other_songs.difference(seen)))
        seen = seen.union(other_songs)
    return recs[:num_songs]
# get song ids from recommendations function
song_recs = recommendations(4395, 10, interactions_matrix)
# display top 10 recommended songs for the given user
# (notebook-style expression: shows metadata rows for the recommended ids)
song_df[song_df['song_id'].isin(song_recs)]
# get SVD matrices; full_matrices=False gives the reduced ("economy") SVD
u, s, vt = np.linalg.svd(interactions_matrix, full_matrices=False)
# u: one row per user, one column per latent feature
# (counts below reflect the original run of this dataset)
# 19103 users, 7887 latent features
u.shape
# s: one singular value per latent feature
# 7887 latent features
s.shape
# vt: one row per latent feature, one column per song
# 7887 latent features, 7887 items
vt.shape
# split count_df into train and test
x_train, x_test = train_test_split(count_df_15, test_size=0.2, random_state=42)
# 80% of the data
x_train.shape
# 20% of the data
x_test.shape
# train interaction matrix (pivot sorts both the user and song axes)
interactions_matrix_train = x_train.pivot(index='user_id', columns='song_id', values='play_count')
interactions_matrix_train.fillna(0, inplace=True)
# test interaction matrix
interactions_matrix_test = x_test.pivot(index='user_id', columns='song_id', values='play_count')
interactions_matrix_test.fillna(0, inplace=True)
# find unique users in train and test data and then take their intersection i.e. common users in train and test data
train_idx = set(interactions_matrix_train.index)
test_idx = set(interactions_matrix_test.index)
match_idx = train_idx.intersection(test_idx)
# find unique songs in train and test data and then take their intersection i.e. common songs in train and test data
train_songs = set(interactions_matrix_train.columns)
test_songs = set(interactions_matrix_test.columns)
match_cols = train_songs.intersection(test_songs)
# selecting only common users and songs from the test interaction matrix;
# sorted lists because .loc no longer accepts raw sets (removed in
# pandas >= 2.0) and sorting keeps rows/columns in the same sorted order as
# the train pivot, so later row/column masks on the train matrix line up
interactions_matrix_test = interactions_matrix_test.loc[sorted(match_idx), sorted(match_cols)]
# break down train matrix with SVD
u_train, s_train, vt_train = np.linalg.svd(interactions_matrix_train, full_matrices=False)
# finding u_test and vt_test matrices using u_train, vt_train and common user/songs in train and test data
# boolean masks over the *train* axes: isin(test_idx)/isin(test_songs) keeps
# exactly the users/songs common to both splits, in train-matrix order
row_idxs = interactions_matrix_train.index.isin(test_idx)
col_idxs = interactions_matrix_train.columns.isin(test_songs)
# NOTE(review): this assumes interactions_matrix_test rows/columns are in the
# same (sorted) order as the masked train matrix -- verify the alignment
u_test = u_train[row_idxs, :]
vt_test = vt_train[:, col_idxs]
# sweep over number of latent features (0, 20, 40, ..., 680) and record errors
latent_features = np.arange(0, 700, 20)
train_error = []
test_error = []
for k in latent_features:
    # slice the U, S, and Vt matrices to get k latent features from train and test data
    s_train_lat, u_train_lat, vt_train_lat = np.diag(s_train[:k]), u_train[:, :k], vt_train[:k, :]
    u_test_lat, vt_test_lat = u_test[:, :k], vt_test[:k, :]
    # regenerate train and test interaction matrices using k latent features
    # (k = 0 degenerates to an all-zero prediction matrix)
    interactions_matrix_train_preds = np.around(np.matmul(np.matmul(u_train_lat, s_train_lat), vt_train_lat))
    interactions_matrix_test_preds = np.around(np.matmul(np.matmul(u_test_lat, s_train_lat), vt_test_lat))
    # calculate the actual and predicted average rating for each song in the training data
    avg_rating_train = interactions_matrix_train.mean(axis=0)
    avg_rating_train_pred = interactions_matrix_train_preds.mean(axis=0)
    # calculate the actual and predicted average rating for each song in the test data
    avg_rating_test = interactions_matrix_test.mean(axis=0)
    avg_rating_test_pred = interactions_matrix_test_preds.mean(axis=0)
    # calculate train and test RMSE (squared=False returns the root of the MSE)
    # NOTE(review): the `squared` kwarg was removed in scikit-learn >= 1.6;
    # newer versions provide sklearn.metrics.root_mean_squared_error instead
    train_rmse = mean_squared_error(avg_rating_train, avg_rating_train_pred, squared=False)
    test_rmse = mean_squared_error(avg_rating_test, avg_rating_test_pred, squared=False)
    train_error.append(train_rmse)
    test_error.append(test_rmse)
# plotting train and test RMSE against the number of latent features
plt.figure(figsize=(10,7))
plt.title('Latent Feature Model Comparison')
plt.plot(latent_features, train_error, label='Train', marker='o');
plt.plot(latent_features, test_error, label='Test', marker='o');
plt.xlabel('Number of Latent Features');
plt.ylabel('RMSE');
plt.legend();
plt.show()
# rebuild the prediction matrices for chosen latent-feature counts;
# one helper replaces three near-identical copy/paste cells
def _predicted_plays(k):
    """Return predicted play counts reconstructed from the top-k latent features.

    Uses the full-data SVD factors (u, s, vt) computed above; the
    reconstruction |round(U_k S_k Vt_k)| is rounded to whole plays and
    abs() clamps occasional negative reconstructions to usable magnitudes.
    """
    s_k = np.diag(s[:k])
    preds = np.around(np.matmul(np.matmul(u[:, :k], s_k), vt[:k, :]))
    return pd.DataFrame(np.abs(preds), columns=interactions_matrix.columns, index=interactions_matrix.index)

# rerun model with set number of latent features = 10
song_predicted_plays_10 = _predicted_plays(10)
song_predicted_plays_10.head()
# rerun model with set number of latent features = 20
song_predicted_plays_20 = _predicted_plays(20)
song_predicted_plays_20.head()
# rerun model with set number of latent features = 40
song_predicted_plays_40 = _predicted_plays(40)
song_predicted_plays_40.head()
# recommend the songs with the highest predicted plays
def recommend_songs(user_idx, interactions_matrix, preds_df, num_recommendations):
    """Print the top unplayed songs for the user at positional index
    `user_idx`, ranked by predicted play count (returns None)."""
    # actual and predicted play counts for this user, highest first
    actual = interactions_matrix.iloc[user_idx].sort_values(ascending=False)
    predicted = preds_df.iloc[user_idx].sort_values(ascending=False)
    # place the two series side by side, aligned on song id
    combined = pd.concat([actual, predicted], axis=1)
    combined.index.name = 'Recommended Songs'
    combined.columns = ['user_plays', 'user_predictions']
    # keep songs the user has never played (actual plays == 0)
    unplayed = combined.loc[combined.user_plays == 0]
    # rank the unplayed songs by predicted interest
    unplayed = unplayed.sort_values('user_predictions', ascending=False)
    print('\nBelow are the recommended songs for user(user_id = {}):\n'.format(user_idx))
    print(unplayed['user_predictions'].head(num_recommendations))
# return predicted number of plays, sorted by highest (prints; returns None)
recommend_songs(3125, interactions_matrix, song_predicted_plays_10, 5)
# display song information based on song id
# NOTE(review): these ids were transcribed by hand from the printout above --
# they will change if the data or the model changes
song_df[song_df['song_id'].isin([365178, 329904, 15080, 350728, 297789])]
# create a dataframe containing average actual plays and average predicted plays of the songs
rmse_df = pd.concat([interactions_matrix.mean(), song_predicted_plays_10.mean(),
song_predicted_plays_20.mean(), song_predicted_plays_40.mean()], axis=1)
rmse_df.columns = ['avg_actual_plays', 'avg_predicted_plays_10', 'avg_predicted_plays_20', 'avg_predicted_plays_40']
rmse_df.head()
# low RMSE means predictions are close to actual number of plays
# NOTE(review): the `squared` kwarg was removed in scikit-learn >= 1.6
RMSE_10 = mean_squared_error(rmse_df['avg_actual_plays'], rmse_df['avg_predicted_plays_10'], squared=False)
RMSE_20 = mean_squared_error(rmse_df['avg_actual_plays'], rmse_df['avg_predicted_plays_20'], squared=False)
RMSE_40 = mean_squared_error(rmse_df['avg_actual_plays'], rmse_df['avg_predicted_plays_40'], squared=False)
print('\nRMSE SVD Model: 10 Latent Features = {} \n'.format(RMSE_10))
print('\nRMSE SVD Model: 20 Latent Features = {} \n'.format(RMSE_20))
print('\nRMSE SVD Model: 40 Latent Features = {} \n'.format(RMSE_40))
# recommend the songs with the highest predicted plays
def recommend_songs_2(user_idx, interactions_matrix, preds_df, num_recommendations):
    """Return the song ids (index values) of the top unplayed songs for the
    user at positional index `user_idx`, ranked by predicted play count."""
    # actual and predicted play counts for this user, highest first
    actual = interactions_matrix.iloc[user_idx].sort_values(ascending=False)
    predicted = preds_df.iloc[user_idx].sort_values(ascending=False)
    # side-by-side frame aligned on song id
    combined = pd.concat([actual, predicted], axis=1)
    combined.index.name = 'Recommended Songs'
    combined.columns = ['user_plays', 'user_predictions']
    # songs the user has never played, ranked by predicted interest
    unplayed = combined.loc[combined.user_plays == 0]
    ranked = unplayed.sort_values('user_predictions', ascending=False)
    # returns ids instead of printing, so the result can be reused downstream
    return ranked.index[:num_recommendations]
def hybrid_recs(user_id, num_recommendations):
    """Combine collaborative-filtering and SVD recommendations for a user
    and return the song metadata rows for the merged id set."""
    # first set of recs: user-user collaborative filtering
    cf_recs = recommendations(user_id, num_recommendations, interactions_matrix)
    # second set of recs: SVD reconstruction with 40 latent features
    svd_recs = list(recommend_songs_2(user_id, interactions_matrix, song_predicted_plays_40, num_recommendations))
    # merge both lists, keeping each song id once
    merged_ids = np.unique(cf_recs + svd_recs)
    # look up and return the metadata for the combined recommendations
    return song_df[song_df['song_id'].isin(merged_ids)]
# display the hybrid (CF + SVD) recommendations for user 19, up to 10 per source
hybrid_recs(19, 10)