How to use the autogluon.utils.tabular.ml.constants.BINARY constant in autogluon

To help you get started, we’ve selected a few autogluon examples that show popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github awslabs / autogluon / tests / unittests / test_tabularHPO.py View on Github external
hyperparameters = {'GBM': gbm_options, 'NN': nn_options}
    num_trials = 3
    time_limits = 30
###################################################################

# Each train/test dataset must be located in a single directory with the given file names.
train_file = 'train_data.csv'
test_file = 'test_data.csv'
seed_val = 0 # random seed
EPS = 1e-10

# Information about each dataset in the benchmark is stored in a dict with the keys
# 'url', 'name', 'problem_type', 'label_column' and 'performance_val'.
# performance_val = expected performance on this dataset (lower = better); should be updated based on previously run benchmarks.
binary_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
                  'name': 'AdultIncomeBinaryClassification',
                  'problem_type': BINARY,
                  'label_column': 'class',
                  'performance_val': 0.129} # Mixed types of features.

multi_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/CoverTypeMulticlassClassification.zip',
                  'name': 'CoverTypeMulticlassClassification',
                  'problem_type': MULTICLASS,
                  'label_column': 'Cover_Type',
                  'performance_val': 0.032} # Big dataset with 7 classes; all features are numeric. Runs SLOW.

regression_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/AmesHousingPriceRegression.zip',
                   'name': 'AmesHousingPriceRegression',
                  'problem_type': REGRESSION,
                  'label_column': 'SalePrice',
                  'performance_val': 0.076} # Regression with mixed feature types and skewed Y-values.

toyregres_dataset = {'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/toyRegression.zip',
github awslabs / autogluon / autogluon / utils / tabular / ml / trainer / abstract_trainer.py View on Github external
def __init__(self, path: str, problem_type: str, scheduler_options=None, objective_func=None,
                 num_classes=None, low_memory=False, feature_types_metadata={}, kfolds=0, 
                 stack_ensemble_levels=0, time_limit=None, verbosity=2):
        self.path = path
        self.problem_type = problem_type
        self.feature_types_metadata = feature_types_metadata
        self.verbosity = verbosity
        if objective_func is not None:
            self.objective_func = objective_func
        elif self.problem_type == BINARY:
            self.objective_func = accuracy
        elif self.problem_type == MULTICLASS:
            self.objective_func = accuracy
        else:
            self.objective_func = root_mean_squared_error

        self.objective_func_expects_y_pred = scorer_expects_y_pred(scorer=self.objective_func)
        logger.log(25, "AutoGluon will gauge predictive performance using evaluation metric: %s" % self.objective_func.name)
        if not self.objective_func_expects_y_pred:
            logger.log(25, "This metric expects predicted probabilities rather than predicted class labels, so you'll need to use predict_proba() instead of predict()")

        logger.log(20, "To change this, specify the eval_metric argument of fit()")
        self.num_classes = num_classes
        self.feature_prune = False # will be set to True if feature-pruning is turned on.
        self.low_memory = low_memory
        self.bagged_mode = True if kfolds >= 2 else False
github awslabs / autogluon / autogluon / utils / tabular / ml / models / tabular_nn / tabular_nn_model.py View on Github external
if self.params['y_range'] is None: # Infer default y-range
                y_vals = train_dataset.dataset._data[train_dataset.label_index].asnumpy()
                min_y = float(min(y_vals))
                max_y = float(max(y_vals))
                std_y = np.std(y_vals)
                y_ext = self.params['y_range_extend']*std_y
                if min_y >= 0: # infer y must be nonnegative
                    min_y = max(0, min_y-y_ext)
                else:
                    min_y = min_y-y_ext
                if max_y <= 0: # infer y must be non-positive
                    max_y = min(0, max_y+y_ext)
                else:
                    max_y = max_y+y_ext
                self.params['y_range'] = (min_y, max_y)
        elif self.problem_type == BINARY:
            self.num_classes = 2
            self.num_net_outputs = 2
        
        if self.params['layers'] is None: # Use default choices for MLP architecture
            if self.problem_type == REGRESSION:
                default_layer_sizes = [256, 128] # overall network will have 4 layers. Input layer, 256-unit hidden layer, 128-unit hidden layer, output layer.
            elif self.problem_type == BINARY or self.problem_type == MULTICLASS:
                default_sizes = [256, 128] # will be scaled adaptively
                # base_size = max(1, min(self.num_net_outputs, 20)/2.0) # scale layer width based on number of classes
                base_size = max(1, min(self.num_net_outputs, 100) / 50)  # TODO: Updated because it improved model quality and made training far faster
                default_layer_sizes = [defaultsize*base_size for defaultsize in default_sizes]
            # TODO: This gets really large on 100K+ rows... It takes hours on gpu for nyc-albert: 78 float/int features which get expanded to 1734, it also overfits and maxes accuracy on epoch
            #  LGBM takes 120 seconds on 4 cpu's and gets far better accuracy
            #  Perhaps we should add an order of magnitude to the pre-req with -3, or else scale based on feature count instead of row count.
            # layer_expansion_factor = np.log10(max(train_dataset.num_examples, 1000)) - 2 # scale layers based on num_training_examples
            layer_expansion_factor = 1  # TODO: Hardcoded to 1 because it results in both better model quality and far faster training time
github awslabs / autogluon / autogluon / utils / tabular / ml / models / catboost / catboost_utils.py View on Github external
return score, 1


class RegressionCustomMetric(CustomMetric):
    """Custom metric wrapper for regression problems.

    Scores the first entry of ``approxes`` directly against the target using
    ``self.metric`` (presumably set by the ``CustomMetric`` base class, which
    is defined outside this snippet).
    """

    def _get_y_pred(self, approxes):
        """Return the first entry of ``approxes`` as a numpy array."""
        raw_predictions = approxes[0]
        return np.array(raw_predictions)

    def evaluate(self, approxes, target, weight):
        """Return ``(score, 1)`` for the given predictions and target.

        ``weight`` is accepted but not used by this implementation.
        """
        predictions = self._get_y_pred(approxes=approxes)
        targets = np.array(target)
        return self.metric(targets, predictions), 1


# Lookup table from problem-type constant to the custom metric class used for
# that problem type; construct_custom_catboost_metric() indexes into it.
metric_classes_dict = {
    BINARY: BinaryCustomMetric,
    MULTICLASS: MulticlassCustomMetric,
    REGRESSION: RegressionCustomMetric,
}


def construct_custom_catboost_metric(metric, is_higher_better, needs_pred_proba, problem_type):
    """Pick the metric object (or metric name) to use for ``metric``.

    Returns the string ``'MultiClass'`` for ``log_loss`` on a multiclass
    problem that needs predicted probabilities, and the string ``'Accuracy'``
    for ``accuracy`` (presumably CatBoost built-in metric names). In every
    other case an instance of the custom metric class registered for
    ``problem_type`` in ``metric_classes_dict`` is returned.

    Raises:
        KeyError: if a custom metric is needed and ``problem_type`` has no
            entry in ``metric_classes_dict``.
    """
    metric_name = metric.name
    if metric_name == 'log_loss':
        if problem_type == MULTICLASS and needs_pred_proba:
            return 'MultiClass'
    elif metric_name == 'accuracy':
        return 'Accuracy'

    # No string shortcut applies: wrap the scorer in the per-problem-type class.
    wrapper_cls = metric_classes_dict[problem_type]
    return wrapper_cls(
        metric=metric,
        is_higher_better=is_higher_better,
        needs_pred_proba=needs_pred_proba,
    )
github awslabs / autogluon / autogluon / utils / tabular / ml / utils.py View on Github external
def get_pred_from_proba(y_pred_proba, problem_type=BINARY):
    """Convert model outputs into final predictions.

    * BINARY: each value is thresholded at 0.5, giving a list of 0/1 ints.
    * REGRESSION: the input is returned unchanged.
    * anything else: the argmax along axis 1 is returned.
    """
    if problem_type == BINARY:
        return [1 if proba >= 0.5 else 0 for proba in y_pred_proba]
    if problem_type == REGRESSION:
        return y_pred_proba
    return np.argmax(y_pred_proba, axis=1)
github awslabs / autogluon / autogluon / utils / tabular / ml / learner / abstract_learner.py View on Github external
oracle_pred_proba_norm = [pred * weight for pred, weight in zip(pred_probas, oracle_weights)]
                oracle_pred_proba_ensemble = np.sum(oracle_pred_proba_norm, axis=0)
                if (trainer.problem_type == BINARY) and (self.problem_type == MULTICLASS):
                    oracle_pred_proba_ensemble = self.label_cleaner.inverse_transform_proba(oracle_pred_proba_ensemble)
                if trainer.objective_func_expects_y_pred:
                    oracle_pred_ensemble = get_pred_from_proba(y_pred_proba=oracle_pred_proba_ensemble, problem_type=self.problem_type)
                    scores['oracle_ensemble_l' + str(level+1)] = self.objective_func(y, oracle_pred_ensemble)
                else:
                    scores['oracle_ensemble_l' + str(level+1)] = self.objective_func(y, oracle_pred_proba_ensemble)

            model_names_aux = trainer.models_level_auxiliary[level]
            if len(model_names_aux) > 0:
                pred_probas_auxiliary = self.get_pred_probas_models(X=X_stack, trainer=trainer, model_names=model_names_aux)
                for i, model_name in enumerate(model_names_aux):
                    pred_proba = pred_probas_auxiliary[i]
                    if (trainer.problem_type == BINARY) and (self.problem_type == MULTICLASS):
                        pred_proba = self.label_cleaner.inverse_transform_proba(pred_proba)
                    if trainer.objective_func_expects_y_pred:
                        pred = get_pred_from_proba(y_pred_proba=pred_proba, problem_type=self.problem_type)
                        scores[model_name] = self.objective_func(y, pred)
                    else:
                        scores[model_name] = self.objective_func(y, pred_proba)

        logger.debug('Model scores:')
        logger.debug(str(scores))
        model_names = []
        scores_test = []
        for model in scores.keys():
            model_names.append(model)
            scores_test.append(scores[model])
        df = pd.DataFrame(data={
            'model': model_names,
github awslabs / autogluon / autogluon / utils / tabular / ml / learner / default_learner.py View on Github external
logger.warning("Warning: Ignoring %s (out of %s) training examples for which the label value in column '%s' is missing" % (len(missinglabel_inds),n, self.label))
        X = X.drop(missinglabel_inds, axis=0)

        if self.problem_type is None:
            self.problem_type = self.get_problem_type(X[self.label])

        # Gets labels prior to removal of infrequent classes
        y_uncleaned = X[self.label].copy()  # .astype('category').cat.categories

        self.cleaner = Cleaner.construct(problem_type=self.problem_type, label=self.label, threshold=self.threshold)
        # TODO: Most models crash if it is a multiclass problem with only two labels after thresholding, switch to being binary if this happens. Convert output from trainer to multiclass output preds in learner
        # TODO: What if all classes in X are low frequency in multiclass? Currently we would crash. Not certain how many problems actually have this property
        X = self.cleaner.fit_transform(X)  # TODO: Consider merging cleaner into label_cleaner
        self.label_cleaner = LabelCleaner.construct(problem_type=self.problem_type, y=X[self.label], y_uncleaned=y_uncleaned)
        if (self.label_cleaner.num_classes is not None) and (self.label_cleaner.num_classes == 2):
            self.trainer_problem_type = BINARY
        else:
            self.trainer_problem_type = self.problem_type

        X, y = self.extract_label(X)
        y = self.label_cleaner.transform(y)

        if X_test is not None and self.label in X_test.columns:
            X_test = self.cleaner.transform(X_test)
            if len(X_test) == 0:
                logger.debug('All X_test data contained low frequency classes, ignoring X_test and generating from subset of X')
                X_test = None
                y_test = None
            else:
                X_test, y_test = self.extract_label(X_test)
                y_test = self.label_cleaner.transform(y_test)
        else:
github awslabs / autogluon / autogluon / utils / tabular / data / label_cleaner.py View on Github external
def construct(problem_type: str, y: Series, y_uncleaned: Series):
    """Build the label cleaner that matches ``problem_type``.

    BINARY uses only ``y``; MULTICLASS uses both ``y`` and ``y_uncleaned``;
    REGRESSION gets a ``LabelCleanerDummy`` built with no arguments.

    Raises:
        NotImplementedError: if ``problem_type`` is none of the three
            supported constants.
    """
    if problem_type == BINARY:
        return LabelCleanerBinary(y)
    if problem_type == MULTICLASS:
        return LabelCleanerMulticlass(y, y_uncleaned)
    if problem_type == REGRESSION:
        return LabelCleanerDummy()
    raise NotImplementedError
github awslabs / autogluon / autogluon / utils / tabular / ml / trainer / model_presets / presets.py View on Github external
def get_preset_models(path, problem_type, objective_func, num_classes=None,
                      hyperparameters=None, hyperparameter_tune=False):
    """Dispatch to the preset-model builder for the given problem type.

    Args:
        path: forwarded unchanged to the selected builder.
        problem_type: BINARY or MULTICLASS selects the classification
            builder; REGRESSION selects the regression builder.
        objective_func: forwarded unchanged to the selected builder.
        num_classes: forwarded to the classification builder only.
        hyperparameters: per-model hyperparameter dict. When omitted (None),
            a fresh ``{'NN': {}, 'GBM': {}}`` is created for this call.
        hyperparameter_tune: forwarded unchanged to the selected builder.

    Returns:
        Whatever the selected builder returns.

    Raises:
        NotImplementedError: if ``problem_type`` is not one of the three
            supported constants.
    """
    # A dict literal as the default value would be created once and shared by
    # every call, so a mutation made downstream would leak into later calls.
    if hyperparameters is None:
        hyperparameters = {'NN': {}, 'GBM': {}}

    if problem_type in [BINARY, MULTICLASS]:
        return get_preset_models_classification(path=path, problem_type=problem_type,
                    objective_func=objective_func, num_classes=num_classes,
                    hyperparameters=hyperparameters, hyperparameter_tune=hyperparameter_tune)
    elif problem_type == REGRESSION:
        return get_preset_models_regression(path=path, problem_type=problem_type,
                    objective_func=objective_func, hyperparameters=hyperparameters, hyperparameter_tune=hyperparameter_tune)
    else:
        raise NotImplementedError("Unsupported problem_type: %r" % (problem_type,))