hyperparameters = {'GBM': gbm_options, 'NN': nn_options}  # model families to tune, and their search spaces
num_trials = 3  # maximum number of hyperparameter configurations to try per model
time_limits = 30  # time budget (in seconds) for hyperparameter tuning
###################################################################
# Each train/test dataset must be located in a single directory with the given file names.
train_file = 'train_data.csv'
test_file = 'test_data.csv'
seed_val = 0 # random seed
EPS = 1e-10
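
###################################################################
# A minimal sketch of how the options above are consumed. This harness is an
# assumption written against the legacy `autogluon.TabularPrediction` (0.0.x)
# API; 'class' is a placeholder label column, not read from the dataset.
from autogluon import TabularPrediction as task

predictor = task.fit(train_data=task.Dataset(file_path=train_file),
                     label='class',  # placeholder label column
                     hyperparameter_tune=True, hyperparameters=hyperparameters,
                     num_trials=num_trials, time_limits=time_limits)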
# Information about each dataset in the benchmark is stored in a dict.
# performance_val = expected performance on this dataset (lower = better); should be updated based on previously run benchmarks.
binary_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
'name': 'AdultIncomeBinaryClassification',
'problem_type': BINARY,
'label_column': 'class',
'performance_val': 0.129} # Mixed types of features.
multi_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/CoverTypeMulticlassClassification.zip',
'name': 'CoverTypeMulticlassClassification',
'problem_type': MULTICLASS,
'label_column': 'Cover_Type',
'performance_val': 0.032} # big dataset with 7 classes, all features are numeric. Runs SLOW.
regression_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/AmesHousingPriceRegression.zip',
'name': 'AmesHousingPriceRegression',
'problem_type': REGRESSION,
'label_column': 'SalePrice',
'performance_val': 0.076} # Regression with mixed feature-types, skewed Y-values.
toyregres_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/toyRegression.zip',
                     'name': 'toyRegression',
                     'problem_type': REGRESSION,
                     'label_column': 'y',
                     'performance_val': 0.183}  # Tiny regression dataset for quick sanity checks.
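
###################################################################
# A minimal sketch of the download step implied by the entries above. The
# helper below is an assumption (not the repository's actual runner), and it
# assumes each zip extracts into a folder named after the dataset's 'name'.
import os
import urllib.request
import zipfile

def fetch_dataset(info, data_dir='datasets'):
    """Download and extract one benchmark dataset; return its train/test CSV paths."""
    dataset_dir = os.path.join(data_dir, info['name'])
    if not os.path.isdir(dataset_dir):
        os.makedirs(data_dir, exist_ok=True)
        zip_path = dataset_dir + '.zip'
        urllib.request.urlretrieve(info['url'], zip_path)
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(data_dir)
    return os.path.join(dataset_dir, train_file), os.path.join(dataset_dir, test_file)

train_path, test_path = fetch_dataset(binary_dataset)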

###################################################################
# Trainer construction (method of a trainer class; the enclosing class
# statement is not shown): pick a default evaluation metric by problem type.
def __init__(self, path: str, problem_type: str, scheduler_options=None, objective_func=None,
             num_classes=None, low_memory=False, feature_types_metadata=None, kfolds=0,
             stack_ensemble_levels=0, time_limit=None, verbosity=2):
self.path = path
self.problem_type = problem_type
    self.feature_types_metadata = feature_types_metadata if feature_types_metadata is not None else {}  # avoid sharing a mutable default argument
self.verbosity = verbosity
if objective_func is not None:
self.objective_func = objective_func
elif self.problem_type == BINARY:
self.objective_func = accuracy
elif self.problem_type == MULTICLASS:
self.objective_func = accuracy
else:
self.objective_func = root_mean_squared_error
self.objective_func_expects_y_pred = scorer_expects_y_pred(scorer=self.objective_func)
logger.log(25, "AutoGluon will gauge predictive performance using evaluation metric: %s" % self.objective_func.name)
if not self.objective_func_expects_y_pred:
logger.log(25, "This metric expects predicted probabilities rather than predicted class labels, so you'll need to use predict_proba() instead of predict()")
logger.log(20, "To change this, specify the eval_metric argument of fit()")
self.num_classes = num_classes
self.feature_prune = False # will be set to True if feature-pruning is turned on.
self.low_memory = low_memory
    self.bagged_mode = kfolds >= 2  # bagging is enabled when at least 2 folds are requested

###################################################################
# Neural-net regression: infer a default output range for the target variable.
if self.params['y_range'] is None:  # Infer default y-range
y_vals = train_dataset.dataset._data[train_dataset.label_index].asnumpy()
min_y = float(min(y_vals))
max_y = float(max(y_vals))
std_y = np.std(y_vals)
y_ext = self.params['y_range_extend']*std_y
if min_y >= 0: # infer y must be nonnegative
min_y = max(0, min_y-y_ext)
else:
min_y = min_y-y_ext
if max_y <= 0: # infer y must be non-positive
max_y = min(0, max_y+y_ext)
else:
max_y = max_y+y_ext
self.params['y_range'] = (min_y, max_y)
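
###################################################################
# Self-contained illustration of the heuristic above (the function name and the
# 0.05 extension factor are illustrative, not pulled from the config): extend
# the observed range by y_range_extend * std(y), but never let a strictly
# nonnegative target become negative (and vice versa).
import numpy as np

def infer_y_range(y_vals, y_range_extend=0.05):
    min_y, max_y = float(np.min(y_vals)), float(np.max(y_vals))
    y_ext = y_range_extend * float(np.std(y_vals))
    min_y = max(0, min_y - y_ext) if min_y >= 0 else min_y - y_ext  # keep a nonnegative target nonnegative
    max_y = min(0, max_y + y_ext) if max_y <= 0 else max_y + y_ext  # keep a nonpositive target nonpositive
    return min_y, max_y

print(infer_y_range(np.array([1.0, 5.0, 9.0])))  # ≈ (0.84, 9.16)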

###################################################################
# Output-head sizing (fragment of an if/elif chain over problem types):
if self.problem_type == BINARY:
self.num_classes = 2
self.num_net_outputs = 2
if self.params['layers'] is None: # Use default choices for MLP architecture
if self.problem_type == REGRESSION:
default_layer_sizes = [256, 128] # overall network will have 4 layers. Input layer, 256-unit hidden layer, 128-unit hidden layer, output layer.
elif self.problem_type == BINARY or self.problem_type == MULTICLASS:
default_sizes = [256, 128] # will be scaled adaptively
# base_size = max(1, min(self.num_net_outputs, 20)/2.0) # scale layer width based on number of classes
base_size = max(1, min(self.num_net_outputs, 100) / 50) # TODO: Updated because it improved model quality and made training far faster
default_layer_sizes = [defaultsize*base_size for defaultsize in default_sizes]
        # TODO: This gets really large on 100K+ rows... On GPU it takes hours for nyc-albert (78 float/int features, which get expanded to 1734); it also overfits and maxes out accuracy within an epoch.
        # LGBM takes 120 seconds on 4 CPUs and gets far better accuracy.
        # Perhaps we should add an order of magnitude to the pre-req with -3, or else scale based on feature count instead of row count.
# layer_expansion_factor = np.log10(max(train_dataset.num_examples, 1000)) - 2 # scale layers based on num_training_examples
layer_expansion_factor = 1 # TODO: Hardcoded to 1 because it results in both better model quality and far faster training time
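
###################################################################
# Worked example of the sizing rule above (helper name is illustrative): with
# `layer_expansion_factor` hardcoded to 1, layer widths depend only on the
# number of network outputs, and only grow past 50 outputs.
def default_layer_sizes(num_net_outputs, default_sizes=(256, 128)):
    base_size = max(1, min(num_net_outputs, 100) / 50)
    return [size * base_size for size in default_sizes]

print(default_layer_sizes(2))    # binary: [256, 128]
print(default_layer_sizes(7))    # CoverType's 7 classes: still [256, 128]
print(default_layer_sizes(100))  # 100 classes: [512.0, 256.0]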

###################################################################
# Custom CatBoost metrics: wrap an AutoGluon scorer in CatBoost's custom
# eval-metric interface; evaluate() must return an (error_sum, weight_sum) pair.
# The line below is the tail of a preceding classification metric's evaluate().
        return score, 1
class RegressionCustomMetric(CustomMetric):
def _get_y_pred(self, approxes):
return np.array(approxes[0])
def evaluate(self, approxes, target, weight):
y_pred = self._get_y_pred(approxes=approxes)
score = self.metric(np.array(target), y_pred)
return score, 1
metric_classes_dict = {
BINARY: BinaryCustomMetric,
MULTICLASS: MulticlassCustomMetric,
REGRESSION: RegressionCustomMetric,
}
def construct_custom_catboost_metric(metric, is_higher_better, needs_pred_proba, problem_type):
if (metric.name == 'log_loss') and (problem_type == MULTICLASS) and needs_pred_proba:
return 'MultiClass'
if metric.name == 'accuracy':
return 'Accuracy'
metric_class = metric_classes_dict[problem_type]
return metric_class(metric=metric, is_higher_better=is_higher_better, needs_pred_proba=needs_pred_proba)
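
###################################################################
# Usage sketch: the return value (a CatBoost built-in metric name or a
# custom-metric object) can be handed to CatBoost as `eval_metric`. The scorer
# below is the `root_mean_squared_error` object referenced earlier in this file;
# the iteration count is arbitrary.
from catboost import CatBoostRegressor

eval_metric = construct_custom_catboost_metric(metric=root_mean_squared_error,
                                               is_higher_better=False,
                                               needs_pred_proba=False,
                                               problem_type=REGRESSION)
model = CatBoostRegressor(eval_metric=eval_metric, iterations=100)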
def get_pred_from_proba(y_pred_proba, problem_type=BINARY):
if problem_type == BINARY:
y_pred = [1 if pred >= 0.5 else 0 for pred in y_pred_proba]
elif problem_type == REGRESSION:
y_pred = y_pred_proba
else:
y_pred = np.argmax(y_pred_proba, axis=1)
return y_pred
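
###################################################################
# Usage sketch for get_pred_from_proba: binary probabilities are thresholded at
# 0.5, multiclass rows are reduced by argmax, regression output passes through.
import numpy as np

print(get_pred_from_proba([0.2, 0.7, 0.5], problem_type=BINARY))  # -> [0, 1, 1]
print(get_pred_from_proba(np.array([[0.1, 0.9],
                                    [0.8, 0.2]]), problem_type=MULTICLASS))  # -> [1 0]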

###################################################################
# Ensemble scoring (fragment): combine model probabilities with oracle weights,
# then score the oracle ensemble and each auxiliary model.
oracle_pred_proba_norm = [pred * weight for pred, weight in zip(pred_probas, oracle_weights)]
oracle_pred_proba_ensemble = np.sum(oracle_pred_proba_norm, axis=0)
if (trainer.problem_type == BINARY) and (self.problem_type == MULTICLASS):
oracle_pred_proba_ensemble = self.label_cleaner.inverse_transform_proba(oracle_pred_proba_ensemble)
if trainer.objective_func_expects_y_pred:
oracle_pred_ensemble = get_pred_from_proba(y_pred_proba=oracle_pred_proba_ensemble, problem_type=self.problem_type)
scores['oracle_ensemble_l' + str(level+1)] = self.objective_func(y, oracle_pred_ensemble)
else:
scores['oracle_ensemble_l' + str(level+1)] = self.objective_func(y, oracle_pred_proba_ensemble)
model_names_aux = trainer.models_level_auxiliary[level]
if len(model_names_aux) > 0:
pred_probas_auxiliary = self.get_pred_probas_models(X=X_stack, trainer=trainer, model_names=model_names_aux)
for i, model_name in enumerate(model_names_aux):
pred_proba = pred_probas_auxiliary[i]
if (trainer.problem_type == BINARY) and (self.problem_type == MULTICLASS):
pred_proba = self.label_cleaner.inverse_transform_proba(pred_proba)
if trainer.objective_func_expects_y_pred:
pred = get_pred_from_proba(y_pred_proba=pred_proba, problem_type=self.problem_type)
scores[model_name] = self.objective_func(y, pred)
else:
scores[model_name] = self.objective_func(y, pred_proba)
logger.debug('Model scores:')
logger.debug(str(scores))
model_names = list(scores.keys())
scores_test = list(scores.values())
df = pd.DataFrame(data={
    'model': model_names,
    'score_test': scores_test,
})
logger.warning("Warning: Ignoring %s (out of %s) training examples for which the label value in column '%s' is missing" % (len(missinglabel_inds),n, self.label))
X = X.drop(missinglabel_inds, axis=0)
if self.problem_type is None:
self.problem_type = self.get_problem_type(X[self.label])
# Gets labels prior to removal of infrequent classes
y_uncleaned = X[self.label].copy() # .astype('category').cat.categories
self.cleaner = Cleaner.construct(problem_type=self.problem_type, label=self.label, threshold=self.threshold)
# TODO: Most models crash on a multiclass problem left with only two labels after thresholding; switch to binary in that case and convert the trainer's output back to multiclass predictions in the learner.
# TODO: What if all classes in X are low frequency in multiclass? Currently we would crash. Not certain how many problems actually have this property
X = self.cleaner.fit_transform(X) # TODO: Consider merging cleaner into label_cleaner
self.label_cleaner = LabelCleaner.construct(problem_type=self.problem_type, y=X[self.label], y_uncleaned=y_uncleaned)
if (self.label_cleaner.num_classes is not None) and (self.label_cleaner.num_classes == 2):
self.trainer_problem_type = BINARY
else:
self.trainer_problem_type = self.problem_type
X, y = self.extract_label(X)
y = self.label_cleaner.transform(y)
if X_test is not None and self.label in X_test.columns:
X_test = self.cleaner.transform(X_test)
if len(X_test) == 0:
logger.debug('All X_test data contained low frequency classes, ignoring X_test and generating from subset of X')
X_test = None
y_test = None
else:
X_test, y_test = self.extract_label(X_test)
y_test = self.label_cleaner.transform(y_test)
else:
    y_test = None  # no labeled test data was provided

###################################################################
# LabelCleaner factory (static factory method; the enclosing LabelCleaner class
# is not shown): dispatch to a concrete cleaner based on the problem type.
def construct(problem_type: str, y: Series, y_uncleaned: Series):
if problem_type == BINARY:
return LabelCleanerBinary(y)
elif problem_type == MULTICLASS:
return LabelCleanerMulticlass(y, y_uncleaned)
elif problem_type == REGRESSION:
return LabelCleanerDummy()
else:
raise NotImplementedError
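
###################################################################
# Usage sketch (assumes the concrete cleaners above expose transform(), as the
# calls elsewhere in this file imply); the toy labels are illustrative.
import pandas as pd

y_train = pd.Series(['yes', 'no', 'no', 'yes'])
cleaner = construct(problem_type=BINARY, y=y_train, y_uncleaned=y_train)
y_encoded = cleaner.transform(y_train)  # labels mapped to {0, 1}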

###################################################################
# Preset model zoo: dispatch to classification or regression presets.
def get_preset_models(path, problem_type, objective_func, num_classes=None,
                      hyperparameters=None, hyperparameter_tune=False):
    if hyperparameters is None:  # avoid sharing a mutable default argument
        hyperparameters = {'NN': {}, 'GBM': {}}
if problem_type in [BINARY, MULTICLASS]:
return get_preset_models_classification(path=path, problem_type=problem_type,
objective_func=objective_func, num_classes=num_classes,
hyperparameters=hyperparameters, hyperparameter_tune=hyperparameter_tune)
elif problem_type == REGRESSION:
return get_preset_models_regression(path=path, problem_type=problem_type,
objective_func=objective_func, hyperparameters=hyperparameters, hyperparameter_tune=hyperparameter_tune)
else:
raise NotImplementedError
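
###################################################################
# Usage sketch (the path is a placeholder; `accuracy` is the scorer object used
# earlier in this file): builds the preset model objects for a binary task.
models = get_preset_models(path='ag_models/', problem_type=BINARY,
                           objective_func=accuracy, num_classes=2)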