natype = U(str, [str])
assert_is_type(path, str, [str])
assert_is_type(pattern, str, None)
assert_is_type(destination_frame, str, None)
assert_is_type(parse, bool)
assert_is_type(header, -1, 0, 1)
assert_is_type(sep, None, I(str, lambda s: len(s) == 1))
assert_is_type(col_names, [str], None)
assert_is_type(col_types, [coltype], {str: coltype}, None)
assert_is_type(na_strings, [natype], {str: natype}, None)
assert (skipped_columns is None) or isinstance(skipped_columns, list), \
    "The skipped_columns parameter should be a list of column names!"
check_frame_id(destination_frame)
patharr = path if isinstance(path, list) else [path]
if any(os.path.split(p)[0] == "~" for p in patharr):
raise H2OValueError("Paths relative to a current user (~) are not valid in the server environment. "
"Please use absolute paths if possible.")
if not parse:
return lazy_import(path, pattern)
else:
return H2OFrame()._import_parse(path, pattern, destination_frame, header, sep, col_names, col_types, na_strings, skipped_columns)
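
# Usage sketch (an assumption, not part of the library code above): how the
# validated parameters map onto a typical h2o.import_file() call. The file path,
# frame name, and column name below are hypothetical.
import h2o
h2o.init()
fr = h2o.import_file("/data/iris.csv",
                     destination_frame="iris",
                     col_types={"species": "enum"},
                     na_strings=["NA", ""])
raw = h2o.import_file("/data/iris.csv", parse=False)  # lazy import, no parsing yet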
if x is None:
if isinstance(y, int):
xset = set(range(training_frame.ncols)) - {y}
else:
xset = set(training_frame.names) - {y}
else:
xset = set()
if is_type(x, int, str): x = [x]
for xi in x:
if is_type(xi, int):
if not (-training_frame.ncols <= xi < training_frame.ncols):
raise H2OValueError("Column %d does not exist in the training frame" % xi)
xset.add(training_frame.names[xi])
else:
if xi not in training_frame.names:
raise H2OValueError("Column %s not in the training frame" % xi)
xset.add(xi)
x = list(xset)
parms["x"] = x
self.build_model(parms)
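
# Minimal standalone sketch (assumed names) of the column-resolution rule above:
# integer specifiers may be negative, Python-style, and are translated to names.
names = ["a", "b", "c", "d"]   # stand-in for training_frame.names
ncols = len(names)             # stand-in for training_frame.ncols

def resolve(xi):
    if isinstance(xi, int):
        if not (-ncols <= xi < ncols):
            raise ValueError("Column %d does not exist in the training frame" % xi)
        return names[xi]
    return xi

print([resolve(i) for i in (0, -1, "b")])   # -> ['a', 'd', 'b']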
def null_deviance(self, train=False, valid=False, xval=False):
"""
Retrieve the null deviance of this model, or None if the attribute is not present.
:param bool train: Get the null deviance for the training set. If both train and valid are False, then train
is selected by default.
:param bool valid: Get the null deviance for the validation set. If both train and valid are True, then train
is selected by default.
:returns: The null deviance, or None if it is not present.
"""
if xval: raise H2OValueError("Cross-validation metrics are not available.")
if not train and not valid: train = True
if train and valid: train = True  # when both are requested, train takes precedence
if train:
return self._model_json["output"]["training_metrics"].null_deviance()
else:
return self._model_json["output"]["validation_metrics"].null_deviance()
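
# Usage sketch (assumed; the dataset path and column names are hypothetical):
# pulling the null deviance from a trained GLM's metrics.
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
h2o.init()
fr = h2o.import_file("/data/prostate.csv")
fr["CAPSULE"] = fr["CAPSULE"].asfactor()
glm = H2OGeneralizedLinearEstimator(family="binomial")
glm.train(x=["AGE", "PSA"], y="CAPSULE", training_frame=fr)
print(glm.null_deviance())   # training metrics (the default)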
def _verify_training_frame_params(self, *args):
for param in args:
if param is not None:
raise H2OValueError("No training frame defined, yet the parameter %d is has been specified.", param)
assert_is_type(missing_fraction, t_fraction)
assert_is_type(binary_ones_fraction, t_fraction)
assert_is_type(factors, BoundInt(1))
assert_is_type(integer_range, BoundInt(1))
assert_is_type(has_response, bool)
assert_is_type(response_factors, None, BoundInt(1))
assert_is_type(positive_response, bool)
assert_is_type(seed, int, None)
assert_is_type(seed_for_column_types, int, None)
check_frame_id(frame_id)
if randomize and value:
raise H2OValueError("Cannot set data to a `value` if `randomize` is true")
if (categorical_fraction or integer_fraction) and not randomize:
raise H2OValueError("`randomize` should be True when either categorical or integer columns are used.")
# The total column fraction that the user has specified explicitly. This sum should not exceed 1. We will respect
# all explicitly set fractions, and will auto-select the remaining fractions.
frcs = [real_fraction, categorical_fraction, integer_fraction, binary_fraction, time_fraction, string_fraction]
wgts = [0.5, 0.2, 0.2, 0.1, 0.0, 0.0]
sum_explicit_fractions = sum(0 if f is None else f for f in frcs)
count_explicit_fractions = sum(0 if f is None else 1 for f in frcs)
remainder = 1 - sum_explicit_fractions
if sum_explicit_fractions >= 1 + 1e-10:
raise H2OValueError("Fractions of binary, integer, categorical, time and string columns should add up "
"to a number less than 1.")
elif sum_explicit_fractions >= 1 - 1e-10:
# The fractions already add up to almost 1. No need to do anything (the server will absorb the tiny
# remainder into the real_fraction column).
pass
else:
    # Auto-select the remaining fractions (body reconstructed from the comment
    # above): distribute the remainder among the columns whose fractions were
    # left unset, proportionally to their default weights in `wgts`.
    unset_weight = sum(w for w, f in zip(wgts, frcs) if f is None)
    if unset_weight > 0:
        frcs = [remainder * w / unset_weight if f is None else f
                for w, f in zip(wgts, frcs)]
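
# Worked numeric check (assumed, matching the proportional rule sketched above):
# with only categorical_fraction=0.3 given, the remaining 0.7 is split among the
# unset columns according to their default weights.
frcs_demo = [None, 0.3, None, None, None, None]   # real, cat, int, bin, time, str
wgts_demo = [0.5, 0.2, 0.2, 0.1, 0.0, 0.0]
rem = 1 - sum(f for f in frcs_demo if f is not None)
uw = sum(w for w, f in zip(wgts_demo, frcs_demo) if f is None)
print([rem * w / uw if f is None else f for w, f in zip(wgts_demo, frcs_demo)])
# -> [0.4375, 0.3, 0.175, 0.0875, 0.0, 0.0], which sums to 1.0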
names_list = {"model_id", "training_frame", "validation_frame", "ignored_columns", "ignore_const_cols",
"score_each_iteration", "loading_name", "transform", "k", "loss", "loss_by_col",
"loss_by_col_idx", "multi_loss", "period", "regularization_x", "regularization_y", "gamma_x",
"gamma_y", "max_iterations", "max_updates", "init_step_size", "min_step_size", "seed", "init",
"svd_method", "user_y", "user_x", "expand_user_y", "impute_original", "recover_svd",
"max_runtime_secs"}
if "Lambda" in kwargs: kwargs["lambda_"] = kwargs.pop("Lambda")
for pname, pvalue in kwargs.items():
if pname == 'model_id':
self._id = pvalue
self._parms["model_id"] = pvalue
elif pname in names_list:
# Using setattr(...) will invoke type-checking of the arguments
setattr(self, pname, pvalue)
else:
raise H2OValueError("Unknown parameter %s = %r" % (pname, pvalue))
self._parms["_rest_version"] = 3
def __init__(self, **kwargs):
super(H2ORandomForestEstimator, self).__init__()
self._parms = {}
for pname, pvalue in kwargs.items():
if pname == 'model_id':
self._id = pvalue
self._parms["model_id"] = pvalue
elif pname in self.param_names:
# Using setattr(...) will invoke type-checking of the arguments
setattr(self, pname, pvalue)
else:
raise H2OValueError("Unknown parameter %s = %r" % (pname, pvalue))
>>> air_gbm.train(x=myX,
... y=myY,
... training_frame=train,
... validation_frame=valid)
>>> air_gbm.plot(type="roc", train=True, server=True)
>>> air_gbm.plot(type="roc", valid=True, server=True)
>>> perf = air_gbm.model_performance(valid)
>>> perf.plot(type="roc", server=True)
"""
assert_is_type(metric, "AUTO", "logloss", "auc", "classification_error", "rmse", "objective",
"negative_log_likelihood")
if self._model_json["algo"] in ("deeplearning", "deepwater", "xgboost", "drf", "gbm"):
# make sure metric is not those of GLM metrics for other models
if metric in ("negative_log_likelihood", "objective"):
raise H2OValueError("Metrics: negative_log_likelihood, objective are only for glm models.")
if metric == "AUTO":
metric = "logloss"
self._plot(timestep=timestep, metric=metric, server=server)
def _plot(self, timestep, metric, server=False):
plt = _get_matplotlib_pyplot(server)
if not plt: return
scoring_history = self.scoring_history()
# Separate functionality for GLM since its output is different from other algos
if self._model_json["algo"] == "glm":
# GLM has only one timestep option, which is `iteration`
timestep = "iteration"
if metric == "AUTO":
metric = "log_likelihood"
elif metric not in ("log_likelihood", "objective"):
raise H2OValueError("for GLM, metric must be one of: log_likelihood, objective")
plt.xlabel(timestep)
plt.ylabel(metric)
plt.title("Validation Scoring History")
plt.plot(scoring_history[timestep], scoring_history[metric])
elif self._model_json["algo"] in ("deeplearning", "deepwater", "xgboost", "drf", "gbm"):
# Set timestep
if self._model_json["algo"] in ("gbm", "drf", "xgboost"):
assert_is_type(timestep, "AUTO", "duration", "number_of_trees")
if timestep == "AUTO":
timestep = "number_of_trees"
else:  # self._model_json["algo"] == "deeplearning"
# Delete first row of DL scoring history since it contains NAs & NaNs
if scoring_history["samples"][0] == 0:
scoring_history = scoring_history[1:]
assert_is_type(timestep, "AUTO", "epochs", "samples", "duration")
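
# Usage sketch (assumed), reusing the frame/column names from the doctest above
# (myX, myY, train, valid): with timestep and metric left at "AUTO", the code
# above resolves them to "number_of_trees" and "logloss" for a GBM.
from h2o.estimators.gbm import H2OGradientBoostingEstimator
gbm = H2OGradientBoostingEstimator(ntrees=50)
gbm.train(x=myX, y=myY, training_frame=train, validation_frame=valid)
gbm.plot(timestep="AUTO", metric="AUTO", server=True)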