# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_poor_man_boruta_selection(train_df, holdout_df, train_fn, eval_fn, base_extractor, metric_name):
    """Exercise the three stopping criteria of poor_man_boruta_selection.

    Runs the selector three times over the same six candidate features,
    each time configured so a different criterion fires:
    min_remaining_features, iter_limit, and early_stop.
    """
    candidate_features = ["x1", "x2", "x3", "x4", "x5", "x6"]

    # Stop driven by the remaining-features floor.
    logs = poor_man_boruta_selection(
        train_df, holdout_df, train_fn, candidate_features, eval_fn,
        base_extractor, metric_name,
        max_removed_by_step=1, threshold=0,
        early_stop=10, iter_limit=50, min_remaining_features=5)
    assert len(get_used_features(first(logs))) <= 6  # Assert stop by remaining features

    # Stop driven by the iteration limit: exactly one log entry.
    logs = poor_man_boruta_selection(
        train_df, holdout_df, train_fn, candidate_features, eval_fn,
        base_extractor, metric_name,
        max_removed_by_step=1, threshold=0,
        early_stop=10, iter_limit=1, min_remaining_features=3)
    assert len(logs) == 1  # Assert stop by iter limit

    # Stop driven by early stopping: two log entries.
    logs = poor_man_boruta_selection(
        train_df, holdout_df, train_fn, candidate_features, eval_fn,
        base_extractor, metric_name,
        max_removed_by_step=1, threshold=1,
        early_stop=2, iter_limit=50, min_remaining_features=1)
    assert len(logs) == 2  # Assert stop by early_stop
def test_get_used_features(logs):
    """get_used_features should report the features of a log entry in order."""
    expected = ['x1', 'x2', 'x4', 'x5', 'x3', 'x6']
    assert get_used_features(logs[0]) == expected
def test_backward_subset_feature_selection(train_df, train_fn, eval_fn, split_fn, base_extractor, metric_name):
    """Exercise the three stopping criteria of backward_subset_feature_selection.

    The selector removes whole named subsets of features; each run below is
    configured so a different criterion fires: min_remaining_features,
    iter_limit, and early_stop.
    """
    subsets = {"first": ["x1", "x2"], "second": ["x4", "x5"], "third": ["x3", "x6"]}

    # Stop driven by the remaining-features floor.
    logs = backward_subset_feature_selection(
        train_df, train_fn, subsets, split_fn, eval_fn, base_extractor, metric_name,
        num_removed_by_step=1, threshold=-1, early_stop=10, iter_limit=50,
        min_remaining_features=5)
    assert len(get_used_features(first(logs)[0])) <= 5  # Assert stop by remaining features

    # Stop driven by the iteration limit: exactly one log entry.
    logs = backward_subset_feature_selection(
        train_df, train_fn, subsets, split_fn, eval_fn, base_extractor, metric_name,
        num_removed_by_step=1, threshold=0, early_stop=10, iter_limit=1,
        min_remaining_features=3)
    assert len(logs) == 1  # Assert stop by iter limit

    # Stop driven by early stopping: two log entries.
    logs = backward_subset_feature_selection(
        train_df, train_fn, subsets, split_fn, eval_fn, base_extractor, metric_name,
        num_removed_by_step=1, threshold=1, early_stop=2, iter_limit=50,
        min_remaining_features=1)
    assert len(logs) == 2  # Assert stop by early_stop
def test_feature_importance_backward_selection(train_df, train_fn, eval_fn, split_fn, base_extractor, metric_name):
    """Exercise the three stopping criteria of feature_importance_backward_selection.

    Runs the importance-based backward selector three times over six
    candidate features, each run configured so a different criterion fires:
    min_remaining_features, iter_limit, and early_stop.
    """
    candidate_features = ["x1", "x2", "x3", "x4", "x5", "x6"]

    # Stop driven by the remaining-features floor.
    logs = feature_importance_backward_selection(
        train_df, train_fn, candidate_features, split_fn, eval_fn,
        base_extractor, metric_name,
        num_removed_by_step=1, threshold=0,
        early_stop=10, iter_limit=50, min_remaining_features=5)
    assert len(get_used_features(first(logs))) <= 5  # Assert stop by remaining features

    # Stop driven by the iteration limit: exactly one log entry.
    logs = feature_importance_backward_selection(
        train_df, train_fn, candidate_features, split_fn, eval_fn,
        base_extractor, metric_name,
        num_removed_by_step=1, threshold=0,
        early_stop=10, iter_limit=1, min_remaining_features=3)
    assert len(logs) == 1  # Assert stop by iter limit

    # Stop driven by early stopping: two log entries.
    logs = feature_importance_backward_selection(
        train_df, train_fn, candidate_features, split_fn, eval_fn,
        base_extractor, metric_name,
        num_removed_by_step=1, threshold=1,
        early_stop=2, iter_limit=50, min_remaining_features=1)
    assert len(logs) == 2  # Assert stop by early_stop
seed: int (default 7)
Random seed
Returns
----------
features: list of str
The remaining features after removing based on feature importance
"""
random.seed(seed)
curr_metric = get_avg_metric_from_extractor(log, extractor, metric_name)
eval_size = eval_data.shape[0]
features_to_shuffle = order_feature_importance_avg_from_logs(log)[-max_removed_by_step:] \
if speed_up_by_importance else get_used_features(log)
def shuffle(feature: str) -> pd.DataFrame:
return eval_data.assign(**{feature: eval_data[feature].sample(frac=1.0)})
feature_to_delta_metric = compose(lambda m: curr_metric - m,
get_avg_metric_from_extractor(extractor=extractor, metric_name=metric_name),
gen_validator_log(fold_num=0, test_size=eval_size), eval_fn, predict_fn, shuffle)
if parallel:
metrics = Parallel(n_jobs=nthread, backend="threading")(
delayed(feature_to_delta_metric)(feature) for feature in features_to_shuffle)
feature_to_delta_metric = dict(zip(features_to_shuffle, metrics))
gc.collect()
else:
feature_to_delta_metric = {feature: feature_to_delta_metric(feature) for feature in features_to_shuffle}
Parameters
----------
logs : list of list of dict
A list of log-like lists of dictionaries evaluations.
min_num_features: int (default 50)
The minimun number of features the model can have before stopping
Returns
-------
stop: bool
A boolean whether to stop recursion or not
"""
return len(get_used_features(first(logs))) <= min_num_features