Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Benchmark dataset descriptors.
# performance_val = expected performance on this dataset (lower = better);
# should be updated based on previously run benchmarks.

# Mixed types of features.
binary_dataset = {
    'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
    'name': 'AdultIncomeBinaryClassification',
    'problem_type': BINARY,
    'label_column': 'class',
    'performance_val': 0.129,
}

# Big dataset with 7 classes, all features are numeric. Runs SLOW.
multi_dataset = {
    'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/CoverTypeMulticlassClassification.zip',
    'name': 'CoverTypeMulticlassClassification',
    'problem_type': MULTICLASS,
    'label_column': 'Cover_Type',
    'performance_val': 0.032,
}

# Regression with mixed feature-types, skewed Y-values.
regression_dataset = {
    'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/AmesHousingPriceRegression.zip',
    'name': 'AmesHousingPriceRegression',
    'problem_type': REGRESSION,
    'label_column': 'SalePrice',
    'performance_val': 0.076,
}

# 1-D toy deterministic regression task with: heavy label+feature missingness,
# extra distraction column in test data.
toyregres_dataset = {
    'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/toyRegression.zip',
    'name': 'toyRegression',
    'problem_type': REGRESSION,
    'label_column': 'y',
    'performance_val': 0.183,
}

# 2-D toy noisy, imbalanced 4-class classification task with: feature missingness,
# out-of-vocabulary feature categories in test data, out-of-vocabulary labels in
# test data, training column missing from test data, extra distraction columns
# in test data.
toyclassif_dataset = {
    'url': 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/toyClassification.zip',
    'name': 'toyClassification',
    'problem_type': MULTICLASS,
    'label_column': 'y',
    'performance_val': 0.436,
}
except:
problem_type = REGRESSION
reason = "dtype of label-column == float and label-values can't be converted to int"
else:
problem_type = REGRESSION
reason = "dtype of label-column == float and many unique label-values observed"
elif unique_vals.dtype == 'object':
problem_type = MULTICLASS
reason = "dtype of label-column == object"
elif unique_vals.dtype == 'int':
unique_ratio = len(unique_vals)/float(len(y))
if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
problem_type = MULTICLASS # TODO: Check if integers are from 0 to n-1 for n unique values, if they have a wide spread, it could still be regression
reason = "dtype of label-column == int, but few unique label-values observed"
else:
problem_type = REGRESSION
reason = "dtype of label-column == int and many unique label-values observed"
else:
raise NotImplementedError('label dtype', unique_vals.dtype, 'not supported!')
logger.log(25, "AutoGluon infers your prediction problem is: %s (because %s)" % (problem_type, reason))
logger.log(25, "If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['%s', '%s', '%s'])\n" % (BINARY, MULTICLASS, REGRESSION))
return problem_type
def construct(problem_type: str, y: Series, y_uncleaned: Series):
    """Build the label cleaner that matches ``problem_type``.

    Args:
        problem_type: One of the BINARY, MULTICLASS or REGRESSION constants.
        y: Labels handed to the binary and multiclass cleaners.
        y_uncleaned: Second label series, only forwarded to the multiclass cleaner.

    Returns:
        A LabelCleanerBinary, LabelCleanerMulticlass or LabelCleanerDummy instance.

    Raises:
        NotImplementedError: If ``problem_type`` is none of the three known values.
    """
    if problem_type == BINARY:
        return LabelCleanerBinary(y)
    if problem_type == MULTICLASS:
        return LabelCleanerMulticlass(y, y_uncleaned)
    if problem_type == REGRESSION:
        # Regression ignores both label series and uses the dummy cleaner.
        return LabelCleanerDummy()
    raise NotImplementedError
feature_colind = feature_arraycol_map[feature]
data_list.append(mx.nd.array(processed_array[:,feature_colind], dtype='int32')) # array of ints with data for this embedding feature
self.data_desc.append("embed")
self.feature_dataindex_map[feature] = len(data_list)-1
if len(self.feature_groups['language']) > 0:
for feature in feature_type_map:
if feature_type_map[feature] == 'language':
feature_colinds = feature_arraycol_map[feature]
data_list.append(mx.nd.array(processed_array[:,feature_colinds], dtype='int32')) # array of ints with data for this language feature
self.data_desc.append("language")
self.feature_dataindex_map[feature] = len(data_list)-1
if labels is not None:
labels = np.array(labels)
if self.problem_type == REGRESSION and labels.dtype != np.float32:
labels = labels.astype('float32') # Convert to proper float-type if not already
data_list.append(mx.nd.array(labels.reshape(len(labels),1)))
self.data_desc.append("label")
self.label_index = len(data_list) - 1 # To access data labels, use: self.dataset._data[self.label_index]
self.num_classes = None
if self.problem_type in [BINARY, MULTICLASS]:
self.num_classes = len(set(labels))
self.embed_indices = [i for i in range(len(self.data_desc)) if 'embed' in self.data_desc[i]] # list of indices of embedding features in self.dataset, order matters!
self.language_indices = [i for i in range(len(self.data_desc)) if 'language' in self.data_desc[i]] # list of indices of language features in self.dataset, order matters!
self.num_categories_per_embed_feature = None
self.dataset = mx.gluon.data.dataset.ArrayDataset(*data_list) # Access ith embedding-feature via: self.dataset._data[self.data_desc.index('embed_'+str(i))].asnumpy()
self.dataloader = mx.gluon.data.DataLoader(self.dataset, self.batch_size, shuffle= not is_test,
last_batch = 'keep' if is_test else 'rollover',
num_workers=self.params['num_dataloading_workers']) # no need to shuffle test data
if not is_test:
def predict_proba(self, X, preprocess=True):
    """Return model predictions for ``X`` in the shape appropriate to the problem type.

    Args:
        X: Input data; passed through ``self.preprocess`` first when ``preprocess`` is True.
        preprocess (bool): Whether to run ``self.preprocess(X)`` before predicting.

    Returns:
        * REGRESSION: the output of ``self.model.predict``.
        * BINARY: column 1 of ``self.model.predict_proba`` output when that output is
          2-D with more than one column; otherwise the output unchanged.
        * Otherwise: the full output when it has more than two columns, else column 1.
    """
    features = self.preprocess(X) if preprocess else X

    if self.problem_type == REGRESSION:
        return self.model.predict(features)

    proba = self.model.predict_proba(features)

    if self.problem_type == BINARY:
        # Only slice when there really is a second column to take; a 1-D array
        # or a single-column 2-D array is passed through untouched.
        if len(proba.shape) != 1 and proba.shape[1] > 1:
            return proba[:, 1]
        return proba

    # Non-binary classification: keep every column unless only two (or fewer) exist.
    if proba.shape[1] > 2:
        return proba
    return proba[:, 1]
preds = nd.zeros((new_data.num_examples,1))
else:
preds = nd.zeros((new_data.num_examples, self.num_net_outputs))
i = 0
for batch_idx, data_batch in enumerate(new_data.dataloader):
data_batch = new_data.format_batch_data(data_batch, self.ctx)
preds_batch = self.model(data_batch)
batch_size = len(preds_batch)
if self.problem_type != REGRESSION:
if not predict_proba: # need to take argmax
preds_batch = nd.argmax(preds_batch, axis=1, keepdims=True)
else: # need to take softmax
preds_batch = nd.softmax(preds_batch, axis=1)
preds[i:(i+batch_size)] = preds_batch
i = i+batch_size
if self.problem_type == REGRESSION or not predict_proba:
return preds.asnumpy().flatten() # return 1D numpy array
elif self.problem_type == BINARY and predict_proba:
return preds[:,1].asnumpy() # for binary problems, only return P(Y==1)
return preds.asnumpy() # return 2D numpy array
def construct(problem_type: str, label: str, threshold: int):
    """Build the data cleaner that matches ``problem_type``.

    Args:
        problem_type: One of the BINARY, MULTICLASS or REGRESSION constants.
        label: Forwarded to CleanerMulticlass as ``label`` (unused for other problem types).
        threshold: Forwarded to CleanerMulticlass as ``threshold`` (unused for other problem types).

    Returns:
        A CleanerMulticlass for MULTICLASS, a CleanerDummy for BINARY and REGRESSION.

    Raises:
        NotImplementedError: If ``problem_type`` is none of the three known values.
    """
    if problem_type == BINARY:
        return CleanerDummy()
    if problem_type == MULTICLASS:
        return CleanerMulticlass(label=label, threshold=threshold)
    if problem_type == REGRESSION:
        return CleanerDummy()
    raise NotImplementedError
if len(unique_vals) == 2:
problem_type = BINARY
reason = "only two unique label-values observed"
elif unique_vals.dtype == 'float':
unique_ratio = len(unique_vals) / float(len(y))
if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
try:
can_convert_to_int = np.array_equal(y, y.astype(int))
if can_convert_to_int:
problem_type = MULTICLASS
reason = "dtype of label-column == float, but few unique label-values observed and label-values can be converted to int"
else:
problem_type = REGRESSION
reason = "dtype of label-column == float and label-values can't be converted to int"
except:
problem_type = REGRESSION
reason = "dtype of label-column == float and label-values can't be converted to int"
else:
problem_type = REGRESSION
reason = "dtype of label-column == float and many unique label-values observed"
elif unique_vals.dtype == 'object':
problem_type = MULTICLASS
reason = "dtype of label-column == object"
elif unique_vals.dtype == 'int':
unique_ratio = len(unique_vals)/float(len(y))
if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
problem_type = MULTICLASS # TODO: Check if integers are from 0 to n-1 for n unique values, if they have a wide spread, it could still be regression
reason = "dtype of label-column == int, but few unique label-values observed"
else:
problem_type = REGRESSION
reason = "dtype of label-column == int and many unique label-values observed"
else:
def evaluate(self, y_true, y_pred, silent=False, auxiliary_metrics=False, detailed_report=True, high_always_good=False):
    """ Evaluate predictions.

        Datapoints whose true label is missing (None, NaN for regression problems,
        or the empty string otherwise) are removed from both `y_true` and `y_pred`
        before scoring, and a warning is emitted reporting how many were removed.

        Args:
            y_true: True labels. Assumed to support indexing with a list of integer positions
                (e.g. a numpy array) -- TODO confirm against callers.
            y_pred: Predictions aligned element-wise with `y_true`; indexed the same way.
            silent (bool): Should we print which metric is being used as well as performance.
            auxiliary_metrics (bool): Should we compute other (problem_type specific) metrics in addition to the default metric?
            detailed_report (bool): Should we compute more-detailed versions of the auxiliary_metrics? (requires auxiliary_metrics=True).
            high_always_good (bool): If True, this means higher values of returned metric are ALWAYS superior (so metrics like MSE should be returned negated)

        Returns single performance-value if auxiliary_metrics=False.
        Otherwise returns dict where keys = metrics, values = performance along each metric.
    """
    # Remove missing labels and produce warning if any are found:
    if self.problem_type == REGRESSION:
        missing_indicators = [(y is None or np.isnan(y)) for y in y_true]
    else:
        missing_indicators = [(y is None or y == '') for y in y_true]
    missing_inds = [i for i, is_missing in enumerate(missing_indicators) if is_missing]
    if missing_inds:
        # Capture the total before filtering so the warning reports the original size.
        num_total = len(missing_indicators)
        # Keep only the datapoints whose label is present (the complement of missing_inds).
        nonmissing_inds = [i for i, is_missing in enumerate(missing_indicators) if not is_missing]
        y_true = y_true[nonmissing_inds]
        y_pred = y_pred[nonmissing_inds]
        warnings.warn("There are %s (out of %s) evaluation datapoints for which the label is missing. "
                      "AutoGluon removed these points from the evaluation, which thus may not be entirely representative. "
                      "You should carefully study why there are missing labels in your evaluation data." % (len(missing_inds), num_total))
    perf = self.objective_func(y_true, y_pred)
    metric = self.objective_func.name
    if not high_always_good:
        sign = self.objective_func._sign
def predict_proba(self, X, preprocess=True):
if preprocess:
X = self.preprocess(X)
if self.problem_type == REGRESSION:
return self.model.predict(X)
y_pred_proba = self.model.predict(X)
if (self.problem_type == BINARY):
if len(y_pred_proba.shape) == 1:
return y_pred_proba
elif y_pred_proba.shape[1] > 1:
return y_pred_proba[:, 1]
else:
return y_pred_proba
elif self.problem_type == MULTICLASS:
return y_pred_proba
else:
if len(y_pred_proba.shape) == 1:
return y_pred_proba
elif y_pred_proba.shape[1] > 2: # Should this ever happen?