How to use the autogluon.utils.tabular.ml.constants.REGRESSION constant in autogluon

To help you get started, we’ve selected a few autogluon examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github awslabs / autogluon / tests / unittests / test_tabular.py View on Github external
# Benchmark dataset descriptors used by the tabular tests.
# performance_val = expected performance on this dataset (lower = better);
# it should be updated based on previously run benchmarks.

# Mixed types of features.
binary_dataset = dict(
    url='https://autogluon.s3-us-west-2.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
    name='AdultIncomeBinaryClassification',
    problem_type=BINARY,
    label_column='class',
    performance_val=0.129,
)

# Big dataset with 7 classes, all features are numeric. Runs SLOW.
multi_dataset = dict(
    url='https://autogluon.s3-us-west-2.amazonaws.com/datasets/CoverTypeMulticlassClassification.zip',
    name='CoverTypeMulticlassClassification',
    problem_type=MULTICLASS,
    label_column='Cover_Type',
    performance_val=0.032,
)

# Regression with mixed feature-types, skewed Y-values.
regression_dataset = dict(
    url='https://autogluon.s3-us-west-2.amazonaws.com/datasets/AmesHousingPriceRegression.zip',
    name='AmesHousingPriceRegression',
    problem_type=REGRESSION,
    label_column='SalePrice',
    performance_val=0.076,
)

# 1-D toy deterministic regression task with: heavy label+feature missingness,
# extra distraction column in test data.
toyregres_dataset = dict(
    url='https://autogluon.s3-us-west-2.amazonaws.com/datasets/toyRegression.zip',
    name='toyRegression',
    problem_type=REGRESSION,
    label_column='y',
    performance_val=0.183,
)

# 2-D toy noisy, imbalanced 4-class classification task with: feature
# missingness, out-of-vocabulary feature categories in test data,
# out-of-vocabulary labels in test data, training column missing from test
# data, extra distraction columns in test data.
toyclassif_dataset = dict(
    url='https://autogluon.s3-us-west-2.amazonaws.com/datasets/toyClassification.zip',
    name='toyClassification',
    problem_type=MULTICLASS,
    label_column='y',
    performance_val=0.436,
)
github awslabs / autogluon / autogluon / utils / tabular / ml / learner / abstract_learner.py View on Github external
except:
                    problem_type = REGRESSION
                    reason = "dtype of label-column == float and label-values can't be converted to int"
            else:
                problem_type = REGRESSION
                reason = "dtype of label-column == float and many unique label-values observed"
        elif unique_vals.dtype == 'object':
            problem_type = MULTICLASS
            reason = "dtype of label-column == object"
        elif unique_vals.dtype == 'int':
            unique_ratio = len(unique_vals)/float(len(y))
            if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
                problem_type = MULTICLASS  # TODO: Check if integers are from 0 to n-1 for n unique values, if they have a wide spread, it could still be regression
                reason = "dtype of label-column == int, but few unique label-values observed"
            else:
                problem_type = REGRESSION
                reason = "dtype of label-column == int and many unique label-values observed"
        else:
            raise NotImplementedError('label dtype', unique_vals.dtype, 'not supported!')
        logger.log(25, "AutoGluon infers your prediction problem is: %s  (because %s)" % (problem_type, reason))
        logger.log(25, "If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['%s', '%s', '%s'])\n" % (BINARY, MULTICLASS, REGRESSION))
        return problem_type
github awslabs / autogluon / autogluon / utils / tabular / data / label_cleaner.py View on Github external
def construct(problem_type: str, y: Series, y_uncleaned: Series):
    """Build the label cleaner that matches ``problem_type``.

    Args:
        problem_type: One of the BINARY, MULTICLASS or REGRESSION constants.
        y: Label series handed to the binary and multiclass cleaners.
        y_uncleaned: Second label series, forwarded only to the multiclass
            cleaner.

    Returns:
        ``LabelCleanerBinary(y)`` for BINARY,
        ``LabelCleanerMulticlass(y, y_uncleaned)`` for MULTICLASS,
        ``LabelCleanerDummy()`` for REGRESSION.

    Raises:
        NotImplementedError: If ``problem_type`` is none of the three
            constants above.
    """
    if problem_type == BINARY:
        return LabelCleanerBinary(y)
    if problem_type == MULTICLASS:
        return LabelCleanerMulticlass(y, y_uncleaned)
    if problem_type == REGRESSION:
        # Regression takes neither label series; the dummy cleaner is built without arguments.
        return LabelCleanerDummy()
    raise NotImplementedError
github awslabs / autogluon / autogluon / utils / tabular / ml / models / tabular_nn / tabular_nn_dataset.py View on Github external
feature_colind = feature_arraycol_map[feature]
                    data_list.append(mx.nd.array(processed_array[:,feature_colind], dtype='int32')) # array of ints with data for this embedding feature 
                    self.data_desc.append("embed")
                    self.feature_dataindex_map[feature]  = len(data_list)-1

        if len(self.feature_groups['language']) > 0:
            for feature in feature_type_map:
                if feature_type_map[feature] == 'language':
                    feature_colinds = feature_arraycol_map[feature]
                    data_list.append(mx.nd.array(processed_array[:,feature_colinds], dtype='int32')) # array of ints with data for this language feature 
                    self.data_desc.append("language")
                    self.feature_dataindex_map[feature]  = len(data_list)-1

        if labels is not None:
            labels = np.array(labels)
            if self.problem_type == REGRESSION and labels.dtype != np.float32:
                    labels = labels.astype('float32') # Convert to proper float-type if not already
            data_list.append(mx.nd.array(labels.reshape(len(labels),1)))
            self.data_desc.append("label")
            self.label_index = len(data_list) - 1 # To access data labels, use: self.dataset._data[self.label_index]
            self.num_classes = None
            if self.problem_type in [BINARY, MULTICLASS]:
                self.num_classes = len(set(labels))
        
        self.embed_indices = [i for i in range(len(self.data_desc)) if 'embed' in self.data_desc[i]] # list of indices of embedding features in self.dataset, order matters!
        self.language_indices = [i for i in range(len(self.data_desc)) if 'language' in self.data_desc[i]]  # list of indices of language features in self.dataset, order matters!
        self.num_categories_per_embed_feature = None
        self.dataset = mx.gluon.data.dataset.ArrayDataset(*data_list) # Access ith embedding-feature via: self.dataset._data[self.data_desc.index('embed_'+str(i))].asnumpy()
        self.dataloader = mx.gluon.data.DataLoader(self.dataset, self.batch_size, shuffle= not is_test,
                                last_batch = 'keep' if is_test else 'rollover',
                                num_workers=self.params['num_dataloading_workers']) # no need to shuffle test data
        if not is_test:
github awslabs / autogluon / autogluon / utils / tabular / ml / models / abstract / abstract_model.py View on Github external
def predict_proba(self, X, preprocess=True):
    """Return model predictions in a problem-type dependent shape.

    Args:
        X: Input data passed to the underlying model.
        preprocess: When true, ``X`` is first run through ``self.preprocess``.

    Returns:
        * REGRESSION: the output of ``self.model.predict``.
        * BINARY: the output of ``self.model.predict_proba``; if that output
          is 2-D with more than one column, only column 1 is returned
          (presumably the positive-class probability -- confirm with caller).
        * Any other problem type: the full ``predict_proba`` output when it
          has more than two columns, otherwise column 1 only.
    """
    features = self.preprocess(X) if preprocess else X

    if self.problem_type == REGRESSION:
        return self.model.predict(features)

    proba = self.model.predict_proba(features)

    if self.problem_type == BINARY:
        # Only a 2-D result with several columns needs slicing; a 1-D or
        # single-column result is passed through untouched.
        if len(proba.shape) != 1 and proba.shape[1] > 1:
            return proba[:, 1]
        return proba

    if proba.shape[1] > 2:
        return proba
    return proba[:, 1]
github awslabs / autogluon / autogluon / utils / tabular / ml / models / tabular_nn / tabular_nn_model.py View on Github external
preds = nd.zeros((new_data.num_examples,1))
        else:
            preds = nd.zeros((new_data.num_examples, self.num_net_outputs))
        i = 0
        for batch_idx, data_batch in enumerate(new_data.dataloader):
            data_batch = new_data.format_batch_data(data_batch, self.ctx)
            preds_batch = self.model(data_batch)
            batch_size = len(preds_batch)
            if self.problem_type != REGRESSION: 
                if not predict_proba: # need to take argmax
                    preds_batch = nd.argmax(preds_batch, axis=1, keepdims=True)
                else: # need to take softmax
                    preds_batch = nd.softmax(preds_batch, axis=1)
            preds[i:(i+batch_size)] = preds_batch
            i = i+batch_size
        if self.problem_type == REGRESSION or not predict_proba:
            return preds.asnumpy().flatten() # return 1D numpy array
        elif self.problem_type == BINARY and predict_proba:
            return preds[:,1].asnumpy() # for binary problems, only return P(Y==1)
        return preds.asnumpy() # return 2D numpy array
github awslabs / autogluon / autogluon / utils / tabular / data / cleaner.py View on Github external
def construct(problem_type: str, label: str, threshold: int):
    """Build the data cleaner that matches ``problem_type``.

    Args:
        problem_type: One of the BINARY, MULTICLASS or REGRESSION constants.
        label: Forwarded to ``CleanerMulticlass`` as ``label``.
        threshold: Forwarded to ``CleanerMulticlass`` as ``threshold``.

    Returns:
        ``CleanerDummy()`` for BINARY and REGRESSION,
        ``CleanerMulticlass(label=label, threshold=threshold)`` for MULTICLASS.

    Raises:
        NotImplementedError: If ``problem_type`` is none of the three
            constants above.
    """
    if problem_type == BINARY:
        return CleanerDummy()
    if problem_type == MULTICLASS:
        # Only the multiclass cleaner uses the label name and threshold.
        return CleanerMulticlass(label=label, threshold=threshold)
    if problem_type == REGRESSION:
        return CleanerDummy()
    raise NotImplementedError
github awslabs / autogluon / autogluon / utils / tabular / ml / learner / abstract_learner.py View on Github external
if len(unique_vals) == 2:
            problem_type = BINARY
            reason = "only two unique label-values observed"
        elif unique_vals.dtype == 'float':
            unique_ratio = len(unique_vals) / float(len(y))
            if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
                try:
                    can_convert_to_int = np.array_equal(y, y.astype(int))
                    if can_convert_to_int:
                        problem_type = MULTICLASS
                        reason = "dtype of label-column == float, but few unique label-values observed and label-values can be converted to int"
                    else:
                        problem_type = REGRESSION
                        reason = "dtype of label-column == float and label-values can't be converted to int"
                except:
                    problem_type = REGRESSION
                    reason = "dtype of label-column == float and label-values can't be converted to int"
            else:
                problem_type = REGRESSION
                reason = "dtype of label-column == float and many unique label-values observed"
        elif unique_vals.dtype == 'object':
            problem_type = MULTICLASS
            reason = "dtype of label-column == object"
        elif unique_vals.dtype == 'int':
            unique_ratio = len(unique_vals)/float(len(y))
            if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
                problem_type = MULTICLASS  # TODO: Check if integers are from 0 to n-1 for n unique values, if they have a wide spread, it could still be regression
                reason = "dtype of label-column == int, but few unique label-values observed"
            else:
                problem_type = REGRESSION
                reason = "dtype of label-column == int and many unique label-values observed"
        else:
github awslabs / autogluon / autogluon / utils / tabular / ml / learner / abstract_learner.py View on Github external
def evaluate(self, y_true, y_pred, silent=False, auxiliary_metrics=False, detailed_report=True, high_always_good=False):
        """ Evaluate predictions. 
            Args:
                silent (bool): Should we print which metric is being used as well as performance.
                auxiliary_metrics (bool): Should we compute other (problem_type specific) metrics in addition to the default metric?
                detailed_report (bool): Should we computed more-detailed versions of the auxiliary_metrics? (requires auxiliary_metrics=True).
                high_always_good (bool): If True, this means higher values of returned metric are ALWAYS superior (so metrics like MSE should be returned negated)
            
            Returns single performance-value if auxiliary_metrics=False.
            Otherwise returns dict where keys = metrics, values = performance along each metric.
        """
        
        # Remove missing labels and produce warning if any are found:
        if self.problem_type == REGRESSION:
            missing_indicators = [(y is None or np.isnan(y)) for y in y_true]
        else:
            missing_indicators = [(y is None or y=='') for y in y_true]
        missing_inds = [i for i,j in enumerate(missing_indicators) if j]
        if len(missing_inds) > 0:
            nonmissing_inds = [i for i,j in enumerate(missing_indicators) if j]
            y_true = y_true[nonmissing_inds]
            y_pred = y_pred[nonmissing_inds]
            warnings.warn("There are %s (out of %s) evaluation datapoints for which the label is missing. " 
                          "AutoGluon removed these points from the evaluation, which thus may not be entirely representative. " 
                          "You should carefully study why there are missing labels in your evaluation data." % (len(missing_inds),len(y_true)))
        
        perf = self.objective_func(y_true, y_pred)
        metric = self.objective_func.name
        if not high_always_good:
            sign = self.objective_func._sign
github awslabs / autogluon / autogluon / utils / tabular / ml / models / lgb / lgb_model.py View on Github external
def predict_proba(self, X, preprocess=True):
        if preprocess:
            X = self.preprocess(X)
        if self.problem_type == REGRESSION:
            return self.model.predict(X)

        y_pred_proba = self.model.predict(X)
        if (self.problem_type == BINARY):
            if len(y_pred_proba.shape) == 1:
                return y_pred_proba
            elif y_pred_proba.shape[1] > 1:
                return y_pred_proba[:, 1]
            else:
                return y_pred_proba
        elif self.problem_type == MULTICLASS:
            return y_pred_proba
        else:
            if len(y_pred_proba.shape) == 1:
                return y_pred_proba
            elif y_pred_proba.shape[1] > 2:  # Should this ever happen?