book-recommender.py
# import required libraries
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
import warnings
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm import LightFM
import random
# ignore warnings
warnings.filterwarnings("ignore")
###############################################################################################################################
# function to print sample recommendations for a given user
def sample_recommendation_user(model, interactions, metadata, user_id, user_dict,
                               item_dict, threshold=0, nrec_items=5, show=True):
    # score every item for this user with the trained model
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x, np.arange(n_items), item_features=metadata))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    # items the user has already interacted with above the threshold
    known_items = list(pd.Series(interactions.loc[user_id, :]
                                 [interactions.loc[user_id, :] > threshold].index)
                       .sort_values(ascending=False))
    # drop known items from the ranked list and keep the top nrec_items
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    # map book ids to titles for display
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    if show:
        print("\nUser: " + str(user_id))
        print("Known Likes:")
        counter = 1
        for i in known_items:
            print(str(counter) + '- ' + i)
            counter += 1
        print("Recommended Items:")
        counter = 1
        for i in scores:
            print(str(counter) + '- ' + i)
            counter += 1
###############################################################################################################################
# import books metadata
books_metadata = pd.read_json('./data-books/goodreads_books_poetry.json', lines=True)
# import user-book interactions
interactions = pd.read_json('./data-books/goodreads_interactions_poetry.json', lines=True)
############################################################################################################################### work on books metadata
print('\nBooks metadata:')
print('Fields:')
print(books_metadata.columns.values)
print('Samples from dataset:')
print(books_metadata.sample(2))
print('Shape of books metadata')
print(books_metadata.shape)
# Limit the books metadata to selected fields
print('Select only some fields from books metadata dataset')
books_metadata_selected = books_metadata[['book_id', 'average_rating', 'is_ebook', 'num_pages',
                                          'publication_year', 'ratings_count', 'language_code']]
print(books_metadata_selected.sample(2).to_string(index=False))
print('Perform some manipulation on data...')
# clean num_pages, then use the pandas cut method to convert it into discrete intervals
books_metadata_selected['num_pages'].replace(np.nan, -1, inplace=True)
books_metadata_selected['num_pages'].replace('', -1, inplace=True)
books_metadata_selected['num_pages'] = pd.to_numeric(books_metadata_selected['num_pages'])
books_metadata_selected['num_pages'] = pd.cut(books_metadata_selected['num_pages'], bins=25)
# rounding ratings to the nearest .5 score
books_metadata_selected['average_rating'] = books_metadata_selected['average_rating'].apply(lambda x: round(x*2)/2)
# using pandas qcut method to convert fields into quantile-based discrete intervals
books_metadata_selected['ratings_count'] = pd.qcut(books_metadata_selected['ratings_count'], 25)
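# quick illustration of the discretization above (optional): pd.cut and pd.qcut map each
# raw value to an interval label, which get_dummies will later one-hot encode; inspecting
# a few of the resulting bins makes that concrete
print('Example num_pages bins:', list(books_metadata_selected['num_pages'].cat.categories[:3]))
print('Example ratings_count bins:', list(books_metadata_selected['ratings_count'].cat.categories[:3]))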
# replace missing publication years with the placeholder year 2100
books_metadata_selected['publication_year'].replace(np.nan, 2100, inplace=True)
books_metadata_selected['publication_year'].replace('', 2100, inplace=True)
# replace missing language codes with 'unknown' and merge regional variants into base codes
books_metadata_selected['language_code'].replace(np.nan, 'unknown', inplace=True)
books_metadata_selected['language_code'].replace('', 'unknown', inplace=True)
books_metadata_selected['language_code'].replace('en-GB', 'eng', inplace=True)
books_metadata_selected['language_code'].replace('en-US', 'eng', inplace=True)
books_metadata_selected['language_code'].replace('it-IT', 'ita', inplace=True)
# convert is_ebook column into 1/0 where true=1 and false=0
books_metadata_selected['is_ebook'] = books_metadata_selected.is_ebook.map(lambda x: 1*(x == 'true'))
print('Books metadata after some manipulation')
print(books_metadata_selected.sample(5).to_string(index=False))
############################################################################################################################### work on interactions data
print('\nInteractions data:')
print('Fields')
print(interactions.columns.values)
print('Some samples from dataset')
print(interactions.sample(2).to_string(index=False))
print('Shape of interactions data')
print(interactions.shape)
# Limit the interactions data to selected fields
print('Select only some fields from interactions dataset')
interactions_selected = interactions[['user_id', 'book_id', 'is_read', 'rating']]
print(interactions_selected.sample(2).to_string(index=False))
print('Perform some manipulation on data...')
# convert is_read column into 1/0 where True=1 and False=0
interactions_selected['is_read'] = interactions_selected.is_read.map(lambda x: 1*(x == True))
print('Interaction data after some manipulation')
print(interactions_selected.sample(5).to_string(index=False))
############################################################################################################################### some considerations before processing
# Since we have two fields denoting an interaction between a user and a book (is_read and rating),
# let's see how many data points there are where the user hasn't read the book but has given a rating.
print('\nUsers ratings (columns) divided by having actually read the book (2 rows)')
interactions_counts = interactions_selected.groupby(['rating', 'is_read']).size().reset_index().pivot(columns='rating', index='is_read', values=0)
print(interactions_counts.to_string())
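# optional sanity check for the inference below: count interactions that carry a
# rating >= 1 while is_read is 0 (the table above suggests this should be zero)
rated_but_unread = ((interactions_selected['rating'] >= 1) & (interactions_selected['is_read'] == 0)).sum()
print('Interactions rated >= 1 but marked unread:', rated_but_unread)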
# From the above results, we can infer that every interaction with a rating >= 1 also has is_read = True.
# Therefore, we'll use the rating as the final score, drop interactions where is_read is False,
# and keep only a random sample of 5000 users to limit the data size for further analysis
print('\nSelect only user-item interactions with users that have actually read the book, and a limited number (to simplify calculations)')
interactions_selected = interactions_selected.loc[interactions_selected['is_read']==1, ['user_id', 'book_id', 'rating']]
n_users = 5000
interactions_selected = interactions_selected[interactions_selected['user_id'].isin(random.sample(list(interactions_selected['user_id'].unique()), k=n_users))]
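# note: random.sample above is unseeded, so each run draws a different user subset;
# calling random.seed() with any fixed value before sampling would make the subset
# reproducible (left unseeded here to preserve the original behaviour)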
print(interactions_selected.sample(10).to_string(index=False))
print('Final interactions dataset shape')
print(interactions_selected.shape)
############################################################################################################################### data processing
# Now, let's transform the available data into CSR sparse matrices that can be used for matrix operations.
# We'll start by creating the books_metadata matrix, an np.float64 csr_matrix of shape (n_books, n_books_features).
# Each row contains that book's weights over the features. Before creating the sparse matrix, we'll first build an item dictionary for future reference
item_dict = {}
df = books_metadata[['book_id', 'title']].sort_values('book_id').reset_index()
for i in range(df.shape[0]):
    item_dict[(df.loc[i, 'book_id'])] = df.loc[i, 'title']
# dummify categorical features
books_metadata_selected_transformed = pd.get_dummies(books_metadata_selected,
                                                     columns=['average_rating', 'is_ebook', 'num_pages',
                                                              'publication_year', 'ratings_count',
                                                              'language_code'])
books_metadata_selected_transformed = books_metadata_selected_transformed.sort_values('book_id').reset_index().drop('index', axis=1)
print('First rows of books metadata transformed')
print(books_metadata_selected_transformed.head(5))
# convert to csr matrix
books_metadata_csr = csr_matrix(books_metadata_selected_transformed.drop('book_id', axis=1).values)
print('Sparse matrix of books metadata')
print(repr(books_metadata_csr))
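# a CSR matrix stores only the nonzero entries, which is why the one-hot encoded
# feature matrix stays small in memory; a quick density check (the exact figure
# depends on the dataset):
density = books_metadata_csr.nnz / (books_metadata_csr.shape[0] * books_metadata_csr.shape[1])
print('Density of books metadata matrix: {:.4f}'.format(density))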
# Next, we'll create an interactions matrix, an np.float64 csr_matrix of shape (n_users, n_books).
# We'll also create a user dictionary for future use
user_book_interaction = pd.pivot_table(interactions_selected, index='user_id', columns='book_id', values='rating')
# fill missing values with 0
user_book_interaction = user_book_interaction.fillna(0)
print('First rows of interactions data transformed')
print(user_book_interaction.head(5))
user_id = list(user_book_interaction.index)
user_dict = {}
counter = 0
for i in user_id:
    user_dict[i] = counter
    counter += 1
# convert to csr matrix
user_book_interaction_csr = csr_matrix(user_book_interaction.values)
print('Sparse matrix of interactions data')
print(repr(user_book_interaction_csr))
############################################################################################################################### model
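# WARP (Weighted Approximate-Rank Pairwise) loss optimises the top of each user's
# ranking, which suits top-k recommendation tasks like this one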
model = LightFM(loss='warp', random_state=2016, learning_rate=0.05, no_components=150, user_alpha=0.000005)
model = model.fit(user_book_interaction_csr, item_features=books_metadata_csr, epochs=100, num_threads=16, verbose=False)
n_users = 3
n_rec_items = 5
# take the first n_users distinct users (slicing the raw column could repeat a user
# that has several interactions)
users_ids_list = interactions_selected['user_id'].unique()[0:n_users]
for user_id in users_ids_list:
    sample_recommendation_user(model, user_book_interaction, books_metadata_csr, user_id, user_dict, item_dict, nrec_items=n_rec_items)
# calculate metrics and evaluate model
patk = precision_at_k(model, user_book_interaction_csr, train_interactions=None, k=n_rec_items,
                      user_features=None, item_features=books_metadata_csr, preserve_rows=True,
                      num_threads=1, check_intersections=True)
ratk = recall_at_k(model, user_book_interaction_csr, train_interactions=None, k=n_rec_items,
                   user_features=None, item_features=books_metadata_csr, preserve_rows=True,
                   num_threads=1, check_intersections=True)
mean_auc_score = auc_score(model, user_book_interaction_csr, item_features=books_metadata_csr, user_features=None, num_threads=1).mean()
# note: with preserve_rows=True the metric arrays follow the row order of the
# interaction matrix (users sorted by user_id)
print('\nFor the first', n_users, 'users:')
print('\nPrecision at k (proportion of recommended items in the top-k set that are relevant) with k =', n_rec_items)
print(patk[0:n_users])
print('\nRecall at k (proportion of relevant items found in the top-k recommendations) with k =', n_rec_items)
print(ratk[0:n_users])
print('\nMean AUC score')
print(mean_auc_score)
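###############################################################################################################################
# The metrics above are computed on the same interactions the model was trained on, so they
# measure fit rather than generalisation. A minimal held-out evaluation sketch, using the
# imported random_train_test_split and assuming an 80/20 split is acceptable:
train, test = random_train_test_split(user_book_interaction_csr, test_percentage=0.2,
                                      random_state=np.random.RandomState(2016))
eval_model = LightFM(loss='warp', random_state=2016, learning_rate=0.05, no_components=150, user_alpha=0.000005)
eval_model.fit(train, item_features=books_metadata_csr, epochs=100, num_threads=16)
test_auc = auc_score(eval_model, test, train_interactions=train,
                     item_features=books_metadata_csr, num_threads=1).mean()
print('\nHeld-out mean AUC score (sketch)')
print(test_auc)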