Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_random_train_test_split(test_percentage):
    """
    Check that random_train_test_split honours the requested test fraction.

    The MovieLens training interactions are split, the share of stored
    entries that landed in the test part is compared with test_percentage,
    and the two parts are passed to _assert_disjoint.

    test_percentage is the requested fraction of interactions for the test
    part. NOTE(review): presumably supplied by a parametrize decorator or
    fixture that is not part of the visible text - confirm.
    """
    data = fetch_movielens()["train"]
    train, test = random_train_test_split(data, test_percentage=test_percentage)

    # Only a whole number of interactions can be moved into the test part, so
    # the achieved fraction can match the request to within one interaction at
    # best. Exact float equality fails whenever data.nnz * test_percentage is
    # not an integer (or the division rounds differently).
    achieved = test.nnz / float(data.nnz)
    assert abs(achieved - test_percentage) <= 1.0 / data.nnz

    _assert_disjoint(train, test)
def test_movielens_genre_accuracy():
    """
    Fit LightFM with genre-only item features and check that the ROC AUC
    on the module-level train and test interactions clears fixed thresholds.
    """
    movielens = fetch_movielens(indicator_features=False, genre_features=True)
    item_features = movielens["item_features"]

    # The feature matrix must have fewer columns than rows.
    assert item_features.shape[1] < item_features.shape[0]

    model = LightFM(random_state=SEED)
    model.fit_partial(train, item_features=item_features, epochs=10)

    def predict_for(interactions):
        # Score every stored (row, col) pair of the given interaction matrix.
        return model.predict(
            interactions.row, interactions.col, item_features=item_features
        )

    train_scores = predict_for(train)
    test_scores = predict_for(test)

    train_auc = roc_auc_score(train.data, train_scores)
    assert train_auc > 0.75

    test_auc = roc_auc_score(test.data, test_scores)
    assert test_auc > 0.69
# Fetch MovieLens through datasets.fetch_movielens (indicator and genre item
# features both enabled) and binarise the ratings of its train/test matrices.
#   min_positive_score: ratings >= this value are rewritten to 1.
#   negative_value: value written for every rating below min_positive_score.
# NOTE(review): the visible text stops after `titles` is assigned; no return
# statement for this function is visible here - confirm against the full source.
def get_movielens_100k(min_positive_score=4, negative_value=0):
movielens_100k_dict = datasets.fetch_movielens(indicator_features=True, genre_features=True)
# Replace the matrix's stored values (.data) with a new array of 1 /
# negative_value flags and return the same matrix object.
def flip_ratings(ratings_matrix):
ratings_matrix.data = np.array([1 if rating >= min_positive_score else negative_value
for rating in ratings_matrix.data])
return ratings_matrix
# Both splits are modified in place: flip_ratings returns the object it was given.
test_interactions = flip_ratings(movielens_100k_dict['test'])
train_interactions = flip_ratings(movielens_100k_dict['train'])
# Create indicator features for all users: an identity matrix with one
# row/column per row of the training interaction matrix.
num_users = train_interactions.shape[0]
user_features = sp.identity(num_users)
# Movie titles, read from the 'item_labels' entry of the fetched dict.
titles = movielens_100k_dict['item_labels']
# NOTE(review): these lines look like the tail of a helper whose `def` line is
# not part of the visible text (possibly `_get_feature_matrices`, which is
# called further down but not defined in view). `no_users` and `no_items` are
# not assigned anywhere in the visible text - confirm against the full source.
# Build int32 identity matrices sized by no_users / no_items, converted to CSR.
user_features = sp.identity(no_users, dtype=np.int32).tocsr()
item_features = sp.identity(no_items, dtype=np.int32).tocsr()
# .tocsr() is applied a second time here, on matrices that were already converted above.
return (user_features.tocsr(), item_features.tocsr())
def _binarize(dataset):
    """
    Rewrite dataset.data in place as +1.0 / -1.0 labels and return dataset.

    Stored values that compare >= 4.0 become 1.0; every other stored value
    becomes -1.0. The object passed in is the object returned.
    """
    # Both masks are taken from the untouched values, so the order of the two
    # assignments below does not matter.
    is_positive = dataset.data >= 4.0
    is_negative = np.logical_not(is_positive)

    dataset.data[is_negative] = -1.0
    dataset.data[is_positive] = 1.0

    return dataset
# Module-level data shared by the tests: fetch MovieLens once and pass both
# splits through _binarize.
movielens = fetch_movielens()
train = _binarize(movielens["train"])
test = _binarize(movielens["test"])

# User and item feature matrices built from each split.
train_user_features, train_item_features = _get_feature_matrices(train)
test_user_features, test_item_features = _get_feature_matrices(test)
def test_movielens_accuracy():
    """
    Fit LightFM on the module-level training interactions without passing
    item features and check the ROC AUC on the training data.
    """
    model = LightFM(random_state=SEED)
    model.fit_partial(train, epochs=10)

    train_scores = model.predict(train.row, train.col)
    # NOTE(review): scores for the test split are computed but no assertion on
    # them is visible in this function - confirm against the full source.
    test_scores = model.predict(test.row, test.col)

    train_auc = roc_auc_score(train.data, train_scores)
    assert train_auc > 0.84
def test_movielens_both_accuracy():
    """
    Accuracy with both genre metadata and item-specific
    features should be no worse than with just item-specific
    features (though more training may be necessary).
    """
    movielens = fetch_movielens(indicator_features=True, genre_features=True)
    item_features = movielens["item_features"]

    model = LightFM(random_state=SEED)
    model.fit_partial(train, item_features=item_features, epochs=15)

    def predict_for(interactions):
        # Score every stored (row, col) pair of the given interaction matrix.
        return model.predict(
            interactions.row, interactions.col, item_features=item_features
        )

    train_scores = predict_for(train)
    test_scores = predict_for(test)

    train_auc = roc_auc_score(train.data, train_scores)
    assert train_auc > 0.84

    test_auc = roc_auc_score(test.data, test_scores)
    assert test_auc > 0.75
def test_basic_fetching_movielens():
    """
    Check the matrices and labels returned by fetch_movielens: with default
    arguments, with genre features enabled, and with every item feature
    source disabled (which must raise ValueError).
    """
    data = fetch_movielens()

    # Default fetch: both splits are COO matrices of the same fixed shape.
    for split in ("train", "test"):
        assert isinstance(data[split], sp.coo_matrix)

    train_matrix = data["train"]
    test_matrix = data["test"]
    assert train_matrix.shape == test_matrix.shape
    assert train_matrix.shape == (943, 1682)
    assert (train_matrix.getnnz() + test_matrix.getnnz()) == 100000

    # Default item features are square and labelled by the item labels object itself.
    assert data["item_features"].shape == (1682, 1682)
    assert len(data["item_feature_labels"]) == 1682
    assert data["item_feature_labels"] is data["item_labels"]

    # With genre features the labels are a separate object, one per feature column.
    data = fetch_movielens(genre_features=True)
    feature_labels = data["item_feature_labels"]
    assert data["item_features"].shape == (1682, len(feature_labels))
    assert feature_labels is not data["item_labels"]

    # Disabling both feature sources is rejected.
    with pytest.raises(ValueError):
        fetch_movielens(indicator_features=False, genre_features=False)
def test_basic_fetching_movielens():
    """
    Check the matrices and labels returned by fetch_movielens.

    Covers the default fetch, the fetch with genre features enabled, and the
    ValueError raised when both item feature sources are disabled.

    NOTE(review): a function with this same name is defined earlier in this
    text; this later definition replaces it at import time.
    """
    data = fetch_movielens()

    assert isinstance(data["train"], sp.coo_matrix)
    assert isinstance(data["test"], sp.coo_matrix)

    assert data["train"].shape == data["test"].shape
    assert data["train"].shape == (943, 1682)
    assert (data["train"].getnnz() + data["test"].getnnz()) == 100000

    # Default item features: square matrix, labels are the item labels object.
    assert data["item_features"].shape == (1682, 1682)
    assert len(data["item_feature_labels"]) == 1682
    assert data["item_feature_labels"] is data["item_labels"]

    # The default-fetch assertions used to be repeated after this call. They
    # contradicted the two checks below (e.g. `is` vs `is not` on the same
    # labels object), so the test could never pass; the repeats were removed.
    data = fetch_movielens(genre_features=True)

    assert data["item_features"].shape == (1682, len(data["item_feature_labels"]))
    assert data["item_feature_labels"] is not data["item_labels"]

    # With neither indicator nor genre features requested the fetch must fail.
    with pytest.raises(ValueError):
        data = fetch_movielens(indicator_features=False, genre_features=False)
import numpy as np
from lightfm.datasets import fetch_movielens
from lightfm import LightFM
# CHALLENGE part 1 of 3 - write your own fetch-and-format function for a
# different recommendation dataset. Here are a good few:
# https://gist.github.com/entaroadun/1653794
# Also take a look at fetch_movielens to see what it is doing.
#
# Fetch the data and format it.
# NOTE(review): min_rating=4.0 presumably restricts the interactions to
# ratings of 4.0 and above - confirm against the fetch_movielens documentation.
data = fetch_movielens(min_rating=4.0)
# Print the repr of the training and testing entries.
print(repr(data['train']))
print(repr(data['test']))
# CHALLENGE part 2 of 3 - use 3 different loss functions (so 3 different
# models), compare the results, and print the results for the best one.
# Available loss functions are warp, logistic, bpr, and warp-kos.
# Create the model with the 'warp' loss.
model = LightFM(loss='warp')
# Train the model on the training entry for 30 epochs using 2 threads.
model.fit(data['train'], epochs=30, num_threads=2)
#CHALLENGE part 3 of 3 - Modify this function so that it parses your dataset correctly to retrieve