# http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/  # noqa
# The dataset is from "Last.fm Dataset - 360K users":
# http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html  # noqa
# This requires the implicit package to generate the factors
# (on my desktop/gpu this only takes 4-5 seconds to train - but
# could take 1-2 minutes on a laptop)
import numpy

import implicit
from implicit.approximate_als import augment_inner_product_matrix
from implicit.datasets.lastfm import get_lastfm


# dataset-generation entry point in the style of ann-benchmarks;
# write_output is the ann-benchmarks helper that serializes the vectors
def lastfm(out_fn, n_dimensions, test_size=50000):
    # train an ALS model on the last.fm data
    _, _, play_counts = get_lastfm()
    model = implicit.als.AlternatingLeastSquares(factors=n_dimensions)
    model.fit(implicit.nearest_neighbours.bm25_weight(
        play_counts, K1=100, B=0.8))

    # transform the item factors so that each one has the same norm,
    # and transform the user factors by appending a zero column
    _, item_factors = augment_inner_product_matrix(model.item_factors)
    user_factors = numpy.append(model.user_factors,
                                numpy.zeros((model.user_factors.shape[0], 1)),
                                axis=1)

    # only query the first 50k users (speeds things up significantly
    # without changing the results)
    user_factors = user_factors[:test_size]

    # after that transformation a cosine lookup will return the same results
    # as the inner product on the untransformed data
    write_output(item_factors, user_factors, out_fn, 'angular')
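
# why the zero column works (illustration only, not part of the original
# script): augment_inner_product_matrix pads each item vector with
# sqrt(max_norm**2 - norm**2), so every augmented item ends up with the same
# norm; against a zero-padded query, cosine similarity then differs from the
# raw inner product only by a constant positive factor, leaving the ranking
# unchanged. A quick self-contained check of that claim:
import numpy as np

rng = np.random.default_rng(0)
items = rng.normal(size=(1000, 32))
query = rng.normal(size=32)

norms = np.linalg.norm(items, axis=1)
extra = np.sqrt(np.maximum(norms.max() ** 2 - norms ** 2, 0))
augmented = np.hstack([items, extra[:, None]])

inner_product_order = np.argsort(-items.dot(query))
cosine_order = np.argsort(-augmented.dot(np.append(query, 0))
                          / np.linalg.norm(augmented, axis=1))
assert (inner_product_order == cosine_order).all()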

def run_benchmark(args):
    plays = bm25_weight(scipy.io.mmread(args.inputfile))
    qmf_time = benchmark_qmf(args.qmfpath, plays, args.factors,
                             args.regularization, args.iterations)
    implicit_time = benchmark_implicit(plays, args.factors,
                                       args.regularization, args.iterations)

    print("QMF finished in", qmf_time)
    print("Implicit finished in", implicit_time)
    print("Implicit is %s times faster" % (qmf_time / implicit_time))

def calculate_similar_artists(output_filename, model_name="als"):
    """ Generates a list of similar artists in last.fm by utilizing the
    'similar_items' api of the models """
    artists, users, plays = get_lastfm()

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight input for last.fm
    # by bm25
    if issubclass(model.__class__, AlternatingLeastSquares):
        # let's weight these models by bm25_weight.
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # also disable building the approximate recommend index,
        # since only similar_items is needed here
        model.approximate_recommend = False

    # this is actually disturbingly expensive:
    plays = plays.tocsr()

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    # write out similar artists by popularity
    start = time.time()
    logging.debug("calculating top artists")

def calculate_recommendations(output_filename, model_name="als"):
    """ Generates artist recommendations for each user in the dataset """
    # train the model based off the input params
    artists, users, plays = get_lastfm()

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight input for last.fm
    # by bm25
    if issubclass(model.__class__, AlternatingLeastSquares):
        # let's weight these models by bm25_weight.
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # also disable building the approximate similar_items index,
        # since only recommend is needed here
        model.approximate_similar_items = False

    # this is actually disturbingly expensive:
    plays = plays.tocsr()

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    # generate recommendations for each user and write them out to a file
    start = time.time()
    user_plays = plays.T.tocsr()
    with tqdm.tqdm(total=len(users)) as progress:
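        # sketch of the truncated loop body (an assumption, using the pre-0.5
        # implicit API where recommend() takes the user id plus the
        # user-by-item CSR matrix and returns (itemid, score) tuples):
        with open(output_filename, "w", encoding="utf8") as o:
            for userid, username in enumerate(users):
                for artistid, score in model.recommend(userid, user_plays):
                    o.write("%s\t%s\t%s\n" % (username, artists[artistid], score))
                progress.update(1)
        logging.debug("generated recommendations in %0.2fs", time.time() - start)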

# (parser construction assumed; it isn't part of the original fragment)
parser = argparse.ArgumentParser(description="Benchmarks ALS on the given dataset")
parser.add_argument('--input', type=str, required=True,
                    dest='inputfile', help='dataset file in matrix market format')
parser.add_argument('--graph', help='generates graphs',
                    action="store_true")
parser.add_argument('--loss', help='test training loss',
                    action="store_true")
parser.add_argument('--speed', help='test training speed',
                    action="store_true")
args = parser.parse_args()

if not (args.speed or args.loss):
    print("must specify at least one of --speed or --loss")
    parser.print_help()
else:
    plays = bm25_weight(scipy.io.mmread(args.inputfile)).tocsr()
    logging.basicConfig(level=logging.DEBUG)

    if args.loss:
        acc = benchmark_accuracy(plays)
        with open("als_accuracy.json", "w") as o:
            json.dump(acc, o)
        if args.graph:
            generate_loss_graph(acc, "als_accuracy.png")

    if args.speed:
        speed = benchmark_times(plays)
        with open("als_speed.json", "w") as o:
            json.dump(speed, o)
        if args.graph:
            generate_speed_graph(speed, "als_speed.png")
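
# `benchmark_accuracy` / `benchmark_times` are defined elsewhere in the
# original script; a rough sketch of what the timing helper might do
# (an assumption, not the original implementation):
import time

from implicit.als import AlternatingLeastSquares


def benchmark_times(plays, iterations=15):
    times = {}
    for factors in (32, 64, 128):
        model = AlternatingLeastSquares(factors=factors, iterations=iterations)
        start = time.time()
        model.fit(plays)
        times[factors] = time.time() - start
    return times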

def _set_implib_train_mat(self, train_mat):
    # implib ALS expects the matrix in items x users format
    self.implib_train_mat = train_mat.T
    if self.fit_params['use_bm25']:
        self.implib_train_mat = bm25_weight(
            self.implib_train_mat,
            K1=self.fit_params['bm25_k1'],
            B=self.fit_params['bm25_b'])
    # scale the regularization strength with the number of stored
    # interactions in the training matrix
    self.model.regularization = \
        self.fit_params['regularization'] * self.implib_train_mat.nnz
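    # e.g. with regularization=1e-5 and a matrix holding 2,000,000 plays, the
    # effective lambda handed to ALS is 1e-5 * 2e6 = 20; tying the penalty to
    # nnz keeps it proportional to the size of the data term (illustrative
    # numbers, not from the original benchmark)
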
# remove things < min_rating, and convert to an implicit dataset
# by considering the ratings as a binary preference only
ratings.data[ratings.data < min_rating] = 0
ratings.eliminate_zeros()
ratings.data = np.ones(len(ratings.data))
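# e.g. with min_rating=4.0 and stored values [5.0, 3.0, 4.5], the data first
# becomes [5.0, 0.0, 4.5], eliminate_zeros() then drops the 3.0 entry from the
# sparse structure entirely, and np.ones() maps what's left to [1.0, 1.0]:
# each remaining entry is just a binary "user liked this" flag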

log.info("read data file in %s", time.time() - start)

# generate a recommender model based off the input params
if model_name == "als":
    model = AlternatingLeastSquares()

    # let's weight these models by bm25_weight.
    log.debug("weighting matrix by bm25_weight")
    ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()
elif model_name == "bpr":
    model = BayesianPersonalizedRanking()
elif model_name == "lmf":
    model = LogisticMatrixFactorization()
elif model_name == "tfidf":
    model = TFIDFRecommender()
elif model_name == "cosine":
    model = CosineRecommender()
elif model_name == "bm25":
    model = BM25Recommender(B=0.2)
else:
    raise ValueError("unknown model %s" % model_name)