Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
self.input_spec['validation_frame'] = validation_frame.frame_id if validation_frame is not None else None
leaderboard_frame = H2OFrame._validate(leaderboard_frame, 'leaderboard_frame')
self.input_spec['leaderboard_frame'] = leaderboard_frame.frame_id if leaderboard_frame is not None else None
blending_frame = H2OFrame._validate(blending_frame, 'blending_frame')
self.input_spec['blending_frame'] = blending_frame.frame_id if blending_frame is not None else None
if x is not None:
assert_is_type(x, list)
xset = set()
if is_type(x, int, str): x = [x]
for xi in x:
if is_type(xi, int):
if not (-ncols <= xi < ncols):
raise H2OValueError("Column %d does not exist in the training frame" % xi)
xset.add(names[xi])
else:
if xi not in names:
raise H2OValueError("Column %s not in the training frame" % xi)
xset.add(xi)
ignored_columns = set(names) - xset
for col in [y, fold_column, weights_column]:
if col is not None and col in ignored_columns:
ignored_columns.remove(col)
if ignored_columns is not None:
self.input_spec['ignored_columns'] = list(ignored_columns)
def clean_params(params):
:param str metric: A metric among :const:`maximizing_metrics`.
:param thresholds: thresholds parameter must be a number or a list (i.e. [0.01, 0.5, 0.99]).
If None, then the threshold maximizing the metric will be used.
If 'all', then all stored thresholds are used and returned with the matching metric.
:returns: The set of metrics for the list of thresholds.
The returned list has a 'value' property holding only
the metric value (if no threshold provided or if provided as a number),
or all the metric values (if thresholds provided as a list)
"""
assert_is_type(thresholds, None, 'all', numeric, [numeric])
if metric not in H2OBinomialModelMetrics.maximizing_metrics:
raise ValueError("The only allowable metrics are {}".format(', '.join(H2OBinomialModelMetrics.maximizing_metrics)))
h2o_metric = (H2OBinomialModelMetrics.metrics_aliases[metric] if metric in H2OBinomialModelMetrics.metrics_aliases
else metric)
value_is_scalar = is_type(metric, str) and (thresholds is None or is_type(thresholds, numeric))
if thresholds is None:
thresholds = [self.find_threshold_by_max_metric(h2o_metric)]
elif thresholds == 'all':
thresholds = None
elif is_type(thresholds, numeric):
thresholds = [thresholds]
metrics = List()
thresh2d = self._metric_json['thresholds_and_metric_scores']
if thresholds is None: # fast path to return all thresholds: skipping find_idx logic
metrics.extend(list(t) for t in zip(thresh2d['threshold'], thresh2d[h2o_metric]))
else:
for t in thresholds:
idx = self.find_idx_by_threshold(t)
metrics.append([t, thresh2d[h2o_metric][idx]])
def __init__(self, model, hyper_params, grid_id=None, search_criteria=None):
    """
    Set up a grid search over ``hyper_params`` for the given ``model``.

    :param model: None, an H2OEstimator instance, or an H2OEstimator subclass. A subclass is
        instantiated with no arguments before being stored.
    :param dict hyper_params: hyper-parameter specification; a shallow copy is stored.
    :param str grid_id: optional identifier for the grid (stored as ``_id``).
    :param dict search_criteria: optional search criteria; a shallow copy is stored when given.
    """
    super(H2OGridSearch, self).__init__()

    # Argument validation (same checks, same order).
    assert_is_type(model, None, H2OEstimator, lambda mdl: issubclass(mdl, H2OEstimator))
    assert_is_type(hyper_params, dict)
    assert_is_type(grid_id, None, str)
    assert_is_type(search_criteria, None, dict)

    # Anything that passed validation and is neither None nor an estimator instance
    # is an estimator class: turn it into an instance.
    if model is not None and not is_type(model, H2OEstimator):
        model = model()

    self._id = grid_id
    self.model = model
    self.hyper_params = dict(hyper_params)
    if search_criteria is None:
        self.search_criteria = None
    else:
        self.search_criteria = dict(search_criteria)

    self._grid_json = None
    self.models = None    # will hold the list of H2O Estimator instances
    self._parms = {}      # internal parameters, kept for object recycling
    self.parms = {}       # externally visible parameters
    self._future = False  # consulted by __repr__/show to query the job state
    self._job = None      # only meaningful while _future is True
def _add_agg(self, op, col, na):
    """
    Register aggregation ``op`` for one or more columns and return ``self``.

    :param op: name of the aggregation; ``"nrow"`` always targets column 0.
    :param col: a column name, a column index, a list/tuple of those, or None.
        None applies the aggregation to every column index not present in ``self._by``.
    :param na: NA-handling setting, stored unchanged alongside the aggregation.
    :returns: ``self``, so calls can be chained.
    :raises ValueError: if ``col`` is none of the accepted kinds.
    """
    target = 0 if op == "nrow" else col

    # No column given: fan out over every column that is not a group-by column.
    if target is None:
        for idx in range(self._fr.ncol):
            if idx in self._by:
                continue
            self._add_agg(op, idx, na)
        return self

    if is_type(target, str):
        col_index = self._fr.names.index(target)
    elif is_type(target, int):
        col_index = target
    elif is_type(target, list, tuple):
        # A collection of columns: register each one in turn.
        for item in target:
            self._add_agg(op, item, na)
        return self
    else:
        raise ValueError("col must be a column name or index.")

    # Aggregations are keyed by "<op>_<column name>".
    agg_key = "{}_{}".format(op, self._fr.names[col_index])
    self._aggs[agg_key] = [op, col_index, na]
    return self
tframe = algo_params["training_frame"]
if tframe is None: raise ValueError("Missing training_frame")
if y is not None:
if is_type(y, list, tuple):
if len(y) == 1:
parms["y"] = y[0]
else:
raise ValueError('y must be a single column reference')
if x is None:
if(isinstance(y, int)):
xset = set(range(training_frame.ncols)) - {y}
else:
xset = set(training_frame.names) - {y}
else:
xset = set()
if is_type(x, int, str): x = [x]
for xi in x:
if is_type(xi, int):
if not (-training_frame.ncols <= xi < training_frame.ncols):
raise H2OValueError("Column %d does not exist in the training frame" % xi)
xset.add(training_frame.names[xi])
else:
if xi not in training_frame.names:
raise H2OValueError("Column %s not in the training frame" % xi)
xset.add(xi)
x = list(xset)
parms["x"] = x
self.build_model(parms)
is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"}
if is_auto_encoder and y is not None: raise ValueError("y should not be specified for autoencoder.")
if not is_unsupervised and y is None: raise ValueError("Missing response")
# Step 3
if not training_frame_exists:
parms["training_frame"] = training_frame
offset = parms["offset_column"]
folds = parms["fold_column"]
weights = parms["weights_column"]
if validation_frame is not None: parms["validation_frame"] = validation_frame
if is_type(y, int): y = training_frame.names[y]
if y is not None: parms["response_column"] = y
if not isinstance(x, (list, tuple)): x = [x]
if is_type(x[0], int):
x = [training_frame.names[i] for i in x]
if not training_frame_exists:
ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
[quoted(col) for col in parms["interactions"]])
parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None else
[tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])
parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3
model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))
if self._future:
if x is not None:
raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
for ic in ignored_columns:
if is_type(ic, int):
if not (-ncols <= ic < ncols):
raise H2OValueError("Column %d does not exist in the training frame" % ic)
ignored_columns_set.add(names[ic])
else:
if ic not in names:
raise H2OValueError("Column %s not in the training frame" % ic)
ignored_columns_set.add(ic)
if x is None:
xset = set(names) - {y} - ignored_columns_set
else:
xset = set()
if is_type(x, int, str): x = [x]
for xi in x:
if is_type(xi, int):
if not (-ncols <= xi < ncols):
raise H2OValueError("Column %d does not exist in the training frame" % xi)
xset.add(names[xi])
else:
if xi not in names:
raise H2OValueError("Column %s not in the training frame" % xi)
xset.add(xi)
x = list(xset)
parms["offset_column"] = offset_column
parms["fold_column"] = fold_column
parms["weights_column"] = weights_column
parms["max_runtime_secs"] = max_runtime_secs
if y is not None:
if is_type(y, list, tuple):
if len(y) == 1:
parms["y"] = y[0]
else:
raise ValueError('y must be a single column reference')
if x is None:
if(isinstance(y, int)):
xset = set(range(training_frame.ncols)) - {y}
else:
xset = set(training_frame.names) - {y}
else:
xset = set()
if is_type(x, int, str): x = [x]
for xi in x:
if is_type(xi, int):
if not (-training_frame.ncols <= xi < training_frame.ncols):
raise H2OValueError("Column %d does not exist in the training frame" % xi)
xset.add(training_frame.names[xi])
else:
if xi not in training_frame.names:
raise H2OValueError("Column %s not in the training frame" % xi)
xset.add(xi)
x = list(xset)
parms["x"] = x
self.build_model(parms)
for ic in ignored_columns:
if is_type(ic, int):
if not (-ncols <= ic < ncols):
raise H2OValueError("Column %d does not exist in the training frame" % ic)
ignored_columns_set.add(names[ic])
else:
if ic not in names:
raise H2OValueError("Column %s not in the training frame" % ic)
ignored_columns_set.add(ic)
if x is None:
xset = set(names) - {y} - ignored_columns_set
else:
xset = set()
if is_type(x, int, str): x = [x]
for xi in x:
if is_type(xi, int):
if not (-ncols <= xi < ncols):
raise H2OValueError("Column %d does not exist in the training frame" % xi)
xset.add(names[xi])
else:
if xi not in names:
raise H2OValueError("Column %s not in the training frame" % xi)
xset.add(xi)
x = list(xset)
parms["offset_column"] = offset_column
parms["fold_column"] = fold_column
parms["weights_column"] = weights_column
parms["max_runtime_secs"] = max_runtime_secs
# Overwrites the model_id parameter only if model_id is passed
if model_id is not None:
def _add_agg(self, op, col, na):
    """
    Record an aggregation in ``self._aggs`` and return ``self`` for chaining.

    ``col`` may be a column name, a column index, a list/tuple of either, or None.
    With None, the aggregation is added for every column index that is not in
    ``self._by``. For ``op == "nrow"`` the column is forced to index 0.

    :raises ValueError: when ``col`` is not a name, an index, a list/tuple or None.
    """
    if op == "nrow":
        col = 0

    if col is None:
        # Apply to all columns except the group-by ones.
        for position in range(self._fr.ncol):
            if position not in self._by:
                self._add_agg(op, position, na)
        return self

    frame_names = self._fr.names
    if is_type(col, str):
        resolved = frame_names.index(col)
    elif is_type(col, int):
        resolved = col
    elif is_type(col, list, tuple):
        # Recurse once per requested column.
        for entry in col:
            self._add_agg(op, entry, na)
        return self
    else:
        raise ValueError("col must be a column name or index.")

    # Stored under the key "<op>_<column name>".
    self._aggs["{}_{}".format(op, frame_names[resolved])] = [op, resolved, na]
    return self