Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def sequential_forward_selection(self, number_features_to_keep):
    """
    Select x features by sequential forward selection and build the reduced dataframe.

    An ``SFS`` selector wrapping ``self.model`` is fitted on the
    ``self.x_features`` columns of ``self.dataframe`` against the
    ``self.y_feature`` column, scored with ``'neg_mean_squared_error'``
    under a shuffled 5-fold ``KFold`` split.

    Args:
        number_features_to_keep: forwarded to ``SFS`` as ``k_features``.

    Returns:
        The dataframe obtained from ``FeatureIO`` after keeping only the
        selected x features and adding the ``self.y_feature`` column back.
    """
    # Convert the inputs once; both are reused by the single fit below.
    x_values = np.array(self.dataframe[self.x_features])
    y_values = np.array(self.dataframe[self.y_feature])

    # The original passed ``random_state=False``; 0 is the integer seed that
    # value stands for, spelled explicitly so the intent (a fixed seed) is clear.
    cross_validator = KFold(n_splits=5, shuffle=True, random_state=0)
    sfs = SFS(self.model, k_features=number_features_to_keep, forward=True, floating=False,
              verbose=0, scoring='neg_mean_squared_error', cv=cross_validator)

    # Fit exactly once. The previous code called fit() and then fit_transform(),
    # which repeated the whole search and threw the transformed array away.
    sfs.fit(X=x_values, y=y_values)

    # Map the selected column indices back to their feature names.
    x_features_to_keep = [self.x_features[index] for index in sfs.k_feature_idx_]

    dataframe = FeatureIO(dataframe=self.dataframe).keep_custom_features(
        features_to_keep=x_features_to_keep)
    # Add y_feature back into the dataframe
    dataframe = FeatureIO(dataframe=dataframe).add_custom_features(
        features_to_add=[self.y_feature],
        data_to_add=self.dataframe[self.y_feature])
    # The reduced dataframe is the only product of this method; previously it
    # was built and then silently dropped.
    return dataframe
else:
# Obtain frozen+potentially active parameters list
frz_pot_par = sset(active_par_data)
frz_pot_par.update(self._pipeline._pot_active_par)
frz_pot_par = list(frz_pot_par)
frz_pot_idx = list(range(len(frz_pot_par)))
# Obtain non-frozen potentially active parameters
non_frz_par = [par for par in self._pipeline._pot_active_par
if par not in active_par_data]
non_frz_idx = [frz_pot_par.index(par) for par in non_frz_par]
# If non_frz_par has at least 1 element, carry out analysis
if non_frz_par:
# Create SequentialFeatureSelector object
sfs_obj = SFS(LR(), k_features='parsimonious',
forward=False, floating=False, scoring='r2',
cv=self._n_cross_val)
# Obtain sam_set of frz_pot_par
frz_pot_sam_set = self._sam_set[emul_i][:, frz_pot_par]
# Obtain polynomial terms of frz_pot_sam_set
pf_obj = PF(self._poly_order, include_bias=False)
frz_pot_poly_terms = pf_obj.fit_transform(frz_pot_sam_set)
# Perform linear regression with linear terms only
sfs_obj.fit(frz_pot_sam_set, self._mod_set[emul_i][emul_s])
# Extract active parameters due to linear significance
act_idx_lin = list(sfs_obj.k_feature_idx_)
# plt.plot(Xdata, ydata, '-o', color='r', label='Avg RMSE 10 tests 5-fold CV')
# plt.fill_between(Xdata, np.array(ydata) - np.array(yspread),
# np.array(ydata) + np.array(yspread), alpha=0.1,
# color="r")
# plt.xlabel("Number of features")
# plt.ylabel("RMSE")
# plt.legend(loc="best")
# plt.savefig(savedir + "/" + "basic_forward_selection_learning_curve_featurenumber.png", dpi=250)
# return
# Include Principal Component Analysis.
# The assignment rebinds ``transform`` on the PCA class itself, so it applies
# to every PCA instance created afterwards.
# NOTE(review): presumably the wrapper makes transform return a dataframe whose
# new columns carry the 'pca_' prefix -- confirm against
# dataframify_new_column_names.
PCA.transform = dataframify_new_column_names(PCA.transform, 'pca_')
# Include Sequential Forward Selector (same class-level rebinding, with 'sfs_'
# passed as the second argument).
SequentialFeatureSelector.transform = dataframify_new_column_names(SequentialFeatureSelector.transform, 'sfs_')
# NOTE(review): presumably makes fit operate on the raw values of its inputs --
# confirm against fitify_just_use_values.
SequentialFeatureSelector.fit = fitify_just_use_values(SequentialFeatureSelector.fit)
# Register the patched class under its own name in both mappings.
model_selectors['SequentialFeatureSelector'] = SequentialFeatureSelector
name_to_constructor['SequentialFeatureSelector'] = SequentialFeatureSelector
# Custom selectors don't need to be dataframified
# NOTE: the 'SequentialFeatureSelector' key was already assigned on
# name_to_constructor just above; the entry below re-assigns the same class.
name_to_constructor.update({
# 'PassThrough': PassThrough,
'DoNothing': util_legos.DoNothing,
'PCA': PCA,
'SequentialFeatureSelector': SequentialFeatureSelector,
'MASTMLFeatureSelector': MASTMLFeatureSelector,
})
# plt.fill_between(Xdata, np.array(ydata) - np.array(yspread),
# np.array(ydata) + np.array(yspread), alpha=0.1,
# color="r")
# plt.xlabel("Number of features")
# plt.ylabel("RMSE")
# plt.legend(loc="best")
# plt.savefig(savedir + "/" + "basic_forward_selection_learning_curve_featurenumber.png", dpi=250)
# return
# Include Principal Component Analysis.
# The assignment rebinds ``transform`` on the PCA class itself, so it applies
# to every PCA instance created afterwards.
# NOTE(review): presumably the wrapper makes transform return a dataframe whose
# new columns carry the 'pca_' prefix -- confirm against
# dataframify_new_column_names.
PCA.transform = dataframify_new_column_names(PCA.transform, 'pca_')
# Include Sequential Forward Selector (same class-level rebinding, with 'sfs_'
# passed as the second argument).
SequentialFeatureSelector.transform = dataframify_new_column_names(SequentialFeatureSelector.transform, 'sfs_')
# NOTE(review): presumably makes fit operate on the raw values of its inputs --
# confirm against fitify_just_use_values.
SequentialFeatureSelector.fit = fitify_just_use_values(SequentialFeatureSelector.fit)
# Register the patched class under its own name in both mappings.
model_selectors['SequentialFeatureSelector'] = SequentialFeatureSelector
name_to_constructor['SequentialFeatureSelector'] = SequentialFeatureSelector
# Custom selectors don't need to be dataframified
# NOTE: the 'SequentialFeatureSelector' key was already assigned on
# name_to_constructor just above; the entry below re-assigns the same class.
name_to_constructor.update({
# 'PassThrough': PassThrough,
'DoNothing': util_legos.DoNothing,
'PCA': PCA,
'SequentialFeatureSelector': SequentialFeatureSelector,
'MASTMLFeatureSelector': MASTMLFeatureSelector,
})
poly_idx : 1D :obj:`~numpy.ndarray` object
Array containing the indices of the non-zero polynomial terms in
the regression function.
poly_coef_cov : 1D :obj:`~numpy.ndarray` object (if \
:attr:`~use_regr_cov` is *True*)
Array containing the covariance values of the non-zero polynomial
coefficients.
"""
# Create logger
logger = getRLogger('REGRESSION')
logger.info("Performing regression.")
# Create SequentialFeatureSelector object
sfs_obj = SFS(LR(), k_features='best', forward=True, floating=False,
scoring='neg_mean_squared_error',
cv=self._n_cross_val)
# Create Scikit-learn Pipeline object
# The bias/intercept/constant-term is not included in the SFS object to
# ensure that it is taken into account in the linear regression, since
# it is required for getting the residual variance. It also ensures
# that the SFS does not focus on the constant-term in its calculations.
pipe = Pipeline_sk([('poly', PF(self._poly_order, include_bias=False)),
('SFS', sfs_obj),
('linear', LR())])
# Loop over all emulator systems and perform a regression on them
for emul_s in emul_s_seq:
# Extract active_sam_set
active_sam_set = self._sam_set[emul_i][
# get all additional entries for the options
# opts.update(kwopts)
# retrieve a classifier object
classifier_obj = available_classifiers[classifier](**opts)
# extract the backend classifier
clf = classifier_obj.clf
else:
# if we received a classifier object we'll just use this one
clf = classifier.clf
# Build the selector for the requested strategy. The four variants differ only
# in the ``forward`` / ``floating`` flags; each wraps ``clf``, passes
# ``k_features`` positionally, scores with 'accuracy' and hands ``kfold`` to
# ``cv``. ``algorithm`` holds the human-readable name of the chosen strategy.
if selection_type == 'SFS':
    algorithm = "Sequential Forward Selection (SFS)"
    sfs = SFS(clf, k_features, forward=True, floating=False,
              verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)
elif selection_type == 'SBS':
    algorithm = "Sequential Backward Selection (SBS)"
    sfs = SFS(clf, k_features, forward=False, floating=False,
              verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)
elif selection_type == 'SFFS':
    algorithm = "Sequential Forward Floating Selection (SFFS)"
    sfs = SFS(clf, k_features, forward=True, floating=True,
              verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)
elif selection_type == 'SBFS':
    # Acronym corrected: this branch previously labelled itself "(SFFS)",
    # a copy-paste of the forward-floating label above.
    algorithm = "Sequential Backward Floating Selection (SBFS)"
    sfs = SFS(clf, k_features, forward=False, floating=True,
              verbose=2, scoring='accuracy', cv=kfold, n_jobs=-1)