Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
data = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id",
"gender", "age", "occupation", "zip"]
target = ['rating']
# 1.Label Encoding for sparse features,and do simple Transformation for dense features
for feat in sparse_features:
lbe = LabelEncoder()
data[feat] = lbe.fit_transform(data[feat])
# 2.count #unique features for each sparse field
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(),embedding_dim=4)
for feat in sparse_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
# 3.generate input data for model
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}
# 4.Define Model,train,predict and evaluate
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(train_model_input, train[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
pred_ans = model.predict(test_model_input, batch_size=256)
print("test MSE", round(mean_squared_error(
test[target].values, pred_ans), 4))
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(),embedding_dim=4)
for feat in sparse_features]
use_weighted_sequence = False
if use_weighted_sequence:
varlen_feature_columns = [VarLenSparseFeat('genres', maxlen= max_len,vocabulary_size=len(
key2index) + 1,embedding_dim=4, combiner='mean',weight_name='genres_weight')] # Notice : value 0 is for padding for sequence input feature
else:
varlen_feature_columns = [VarLenSparseFeat('genres', maxlen=max_len,vocabulary_size= len(
key2index) + 1,embedding_dim=4, combiner='mean',weight_name=None)] # Notice : value 0 is for padding for sequence input feature
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns+dnn_feature_columns)
# 3.generate input data for model
model_input = {name:data[name] for name in sparse_features}#
model_input["genres"] = genres_list
model_input["genres_weight"] = np.random.randn(data.shape[0],max_len,1)
# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression')
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
genres_list = list(map(lambda x: x.split('|'), data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)
# 2.set hashing space for each sparse field and generate feature config for sequence feature
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5,embedding_dim=4, use_hash=True, dtype='string')
for feat in sparse_features]
varlen_feature_columns = [VarLenSparseFeat('genres', maxlen=max_len,vocabulary_size=100,embedding_dim=4,combiner= 'mean', use_hash=True,
dtype="string")] # Notice : value 0 is for padding for sequence input feature
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
# 3.generate input data for model
model_input = {name:data[name] for name in feature_names}
model_input['genres'] = genres_list
# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns,dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']
# 1.do simple Transformation for dense features
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])
# 2.set hashing space for each sparse field,and record dense feature field name
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=1000,embedding_dim=4, use_hash=True, dtype='string') # since the input is string
for feat in sparse_features] + [DenseFeat(feat, 1, )
for feat in dense_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns, )
# 3.generate input data for model
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}
# 4.Define Model,train,predict and evaluate
model = DeepFM(linear_feature_columns,dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy",
metrics=['binary_crossentropy'], )
history = model.fit(train_model_input, train[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
hist_iid = np.array([[ 1, 2, 3,0], [ 1, 2, 3,0], [ 1, 2, 0,0]])
hist_igender = np.array([[1, 1, 2,0 ], [2, 1, 1, 0], [2, 1, 0, 0]])
behavior_length = np.array([3,3,2])
feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
'hist_item': hist_iid, 'hist_item_gender': hist_igender,
'score': score,"seq_length":behavior_length}
if use_neg:
feature_dict['neg_hist_item'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
feature_dict['neg_hist_item_gender'] = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
feature_columns += [VarLenSparseFeat('neg_hist_item', maxlen=4,vocabulary_size=3+1,embedding_dim=8, embedding_name='item',length_name="seq_length"),
VarLenSparseFeat('neg_hist_item_gender', maxlen=4, vocabulary_size=3+1,embedding_dim=4,embedding_name='item_gender',length_name="seq_length")]
x = {name:feature_dict[name] for name in get_feature_names(feature_columns)}
y = [1, 0, 1]
return x, y, feature_columns, behavior_feature_list
igender = np.array([1, 2, 1]) # 0 is mask value
score = np.array([0.1, 0.2, 0.3])
sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])
sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
sess_number = np.array([2, 1, 0])
feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
'sess_0_item': sess1_iid, 'sess_0_item_gender': sess1_igender, 'score': score,
'sess_1_item': sess2_iid, 'sess_1_item_gender': sess2_igender, }
x = {name:feature_dict[name] for name in get_feature_names(feature_columns)}
x["sess_length"] = sess_number
y = [1, 0, 1]
return x, y, feature_columns, behavior_feature_list
feature_columns += [VarLenSparseFeat('hist_item', maxlen=4, vocabulary_size=3+1, embedding_dim=8,embedding_name='item'),
VarLenSparseFeat('hist_item_gender', maxlen=4,vocabulary_size=3+1,embedding_dim=4, embedding_name='item_gender')]
behavior_feature_list = ["item", "item_gender"]
uid = np.array([0, 1, 2])
ugender = np.array([0, 1, 0])
iid = np.array([1, 2, 3]) # 0 is mask value
igender = np.array([1, 2, 1]) # 0 is mask value
score = np.array([0.1, 0.2, 0.3])
hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
'hist_item': hist_iid, 'hist_item_gender': hist_igender, 'score': score}
x = {name:feature_dict[name] for name in get_feature_names(feature_columns)}
y = [1, 0, 1]
return x, y, feature_columns, behavior_feature_list