Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Build the combined train/test feature matrix: normalized numeric columns
# followed by one-hot encoded categorical columns, then write a feature map
# and the train/test feature files.

# The target column is removed so that it is not treated as a feature.
trn.drop(TARGET_COL, axis=1, inplace=True)

# Split the columns by dtype. The builtin `object` is used instead of the
# `np.object` alias, which was deprecated in NumPy 1.20 and removed in 1.24;
# the comparison against the column dtype is otherwise identical.
cat_cols = [x for x in trn.columns if trn[x].dtype == object]
num_cols = [x for x in trn.columns if trn[x].dtype != object]
logging.info('categorical: {}, numerical: {}'.format(len(cat_cols),
                                                     len(num_cols)))

# Train and test rows are stacked so both go through the same fitted
# transformers; they are split apart again by row position (n_trn) below.
df = pd.concat([trn, tst], axis=0)

logging.info('normalizing numeric features')
# NOTE(review): Normalizer is defined outside this fragment -- presumably a
# project/third-party scaler with a scikit-learn style fit_transform; confirm.
nm = Normalizer()
df[num_cols] = nm.fit_transform(df[num_cols].values)

# NOTE(review): the log message says "label encoding" but the code below
# applies a OneHotEncoder; message kept unchanged to preserve log output.
logging.info('label encoding categorical variables')
# NOTE(review): OneHotEncoder is defined outside this fragment; `min_obs=10`
# presumably sets a minimum category frequency -- confirm against its docs.
ohe = OneHotEncoder(min_obs=10)
X_ohe = ohe.fit_transform(df[cat_cols])
# One synthetic name per encoded output column: ohe0, ohe1, ...
ohe_cols = ['ohe{}'.format(i) for i in range(X_ohe.shape[1])]

# Numeric block first, encoded block second; CSR so the row slicing below
# is supported.
X = sparse.hstack((df[num_cols].values, X_ohe), format='csr')

# Feature map: one "<index>\t<name>\tq" line per column, in the same order
# as the columns of X.
with open(feature_map_file, 'w') as f:
    for i, col in enumerate(num_cols + ohe_cols):
        f.write('{}\t{}\tq\n'.format(i, col))

logging.info('saving features')
# The first n_trn rows are the training rows (trn came first in the concat);
# the remainder are test rows, saved without labels.
save_data(X[:n_trn], y, train_feature_file)
save_data(X[n_trn:], None, test_feature_file)