def test_openml_param_name_to_sklearn(self):
    scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
    boosting = sklearn.ensemble.AdaBoostClassifier(
        base_estimator=sklearn.tree.DecisionTreeClassifier())
    model = sklearn.pipeline.Pipeline(steps=[
        ('scaler', scaler), ('boosting', boosting)])
    flow = self.extension.model_to_flow(model)
    task = openml.tasks.get_task(115)
    run = openml.runs.run_flow_on_task(flow, task)
    run = run.publish()
    TestBase._mark_entity_for_removal('run', run.run_id)
    TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id))
    run = openml.runs.get_run(run.run_id)
    setup = openml.setups.get_setup(run.setup_id)
    # make sure to test enough parameters
    self.assertGreater(len(setup.parameters), 15)
    for parameter in setup.parameters.values():
        sklearn_name = self.extension._openml_param_name_to_sklearn(parameter, flow)
        # test the inverse. Currently, OpenML stores the hyperparameter
        # fullName as flow.name + flow.version + parameter.name on the
        # server (but this behaviour is not documented and might or might
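Outside the test harness, the same scaler-plus-AdaBoost pipeline can be run against an OpenML task through the public API alone. A minimal sketch, assuming a configured OpenML API key; task 115 is the id used in the test above:

import openml
import sklearn.ensemble
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.tree

# Same pipeline as in the test above (newer scikit-learn versions use
# `estimator=` instead of `base_estimator=` for AdaBoostClassifier).
model = sklearn.pipeline.Pipeline(steps=[
    ('scaler', sklearn.preprocessing.StandardScaler(with_mean=False)),
    ('boosting', sklearn.ensemble.AdaBoostClassifier(
        base_estimator=sklearn.tree.DecisionTreeClassifier())),
])

# run_model_on_task converts the model to a flow and evaluates it on the
# task's predefined cross-validation splits.
task = openml.tasks.get_task(115)
run = openml.runs.run_model_on_task(model, task)
# Publishing the result (run.publish()) additionally requires an API key.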
def test_run_model_on_fold_classification_2(self):
    task = openml.tasks.get_task(7)
    X, y = task.get_X_and_y()
    train_indices, test_indices = task.get_train_test_split_indices(
        repeat=0, fold=0, sample=0)
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    pipeline = sklearn.model_selection.GridSearchCV(
        sklearn.tree.DecisionTreeClassifier(),
        {
            "max_depth": [1, 2],
        },
    )
    # TODO add some mocking here to actually test the innards of this function, too!
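The test is cut off before the pipeline is actually evaluated. As a rough stand-in for what evaluation on this single split amounts to, the grid search can simply be fitted and scored with plain scikit-learn; this is an illustration, not the extension's internal code path:

import sklearn.metrics

# Fit the grid search on the training fold and score it on the held-out fold.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(sklearn.metrics.accuracy_score(y_test, y_pred))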
def _remove_random_state(flow):
    if 'random_state' in flow.parameters:
        del flow.parameters['random_state']
    for component in flow.components.values():
        _remove_random_state(component)

flow = self.extension.model_to_flow(clf)
flow, _ = self._add_sentinel_to_flow_name(flow, sentinel)
if not openml.flows.flow_exists(flow.name, flow.external_version):
    flow.publish()
    TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
    TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id))

task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y()
self.assertEqual(np.count_nonzero(np.isnan(X)), n_missing_vals)

run = openml.runs.run_flow_on_task(
    flow=flow,
    task=task,
    seed=seed,
    avoid_duplicate_runs=openml.config.avoid_duplicate_runs,
)
run_ = run.publish()
TestBase._mark_entity_for_removal('run', run.run_id)
TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id))
self.assertEqual(run_, run)
self.assertIsInstance(run.dataset_id, int)
# This is only a smoke check right now
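After publishing, the run's server-side state can be fetched back with the same calls that appear above. A small sketch, with run_id standing in for the id returned by the publish step:

import openml

# Re-fetch the published run and its hyperparameter setup from the server.
published = openml.runs.get_run(run_id)
setup = openml.setups.get_setup(published.setup_id)
print(published.task_id, published.flow_id, len(setup.parameters))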
def test__get_task(self):
    openml.config.cache_directory = self.static_cache_dir
    openml.tasks.get_task(1882)
def get_dataset_splits(self, task_id):
    """Get the train/test splits for the given task.

    # Arguments
        task_id: Id of the OpenML task.

    # Returns
        Train/test datasets in numpy array format.
    """
    task = openml.tasks.get_task(task_id)
    train_indices, test_indices = task.get_train_test_split_indices()
    dataset = task.get_dataset()
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=task.target_name, dataset_format='array')
    x_train, y_train = X[train_indices], y[train_indices]
    x_test, y_test = X[test_indices], y[test_indices]
    return x_train, y_train, x_test, y_test
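A brief usage sketch for the helper above; `splitter` stands for an instance of the class that defines get_dataset_splits, and task 31 is just an illustrative id:

import sklearn.ensemble
import sklearn.metrics

# Pull the task's predefined split and train any scikit-learn estimator on it.
x_train, y_train, x_test, y_test = splitter.get_dataset_splits(task_id=31)

clf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(x_train, y_train)
print(sklearn.metrics.accuracy_score(y_test, clf.predict(x_test)))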
    n_iterations=10,
    preprocess=False,
    train_test_splits=None,
    rerfs=['binnedBaseRerF'],
    rerfs_kwargs=[None],
    rerf_param_keyword='trees',
    sklearns=['RandomForest'],
    sklearns_kwargs=[None],
    sklearn_param_keyword='n_estimators',
    param_values=range(20, 41, 20),
    return_predictions=False,
    verbose=True,
    acorn=None
):
    task = openml.tasks.get_task(oml_task_id)
    X, y = task.get_X_and_y()

    if len(rerfs) > len(rerfs_kwargs):
        if len(rerfs_kwargs) == 1:
            rerfs_kwargs = [rerfs_kwargs[0] for model in rerfs]
        else:
            raise ValueError('bad rerfs_kwargs')

    if len(sklearns) > len(sklearns_kwargs):
        if len(sklearns_kwargs) == 1:
            sklearns_kwargs = [sklearns_kwargs[0] for model in sklearns]
        else:
            raise ValueError('bad sklearns_kwargs')

    if preprocess:
        # TODO
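The function is truncated here; it presumably goes on to sweep param_values for each model family. A rough plain-scikit-learn sketch of that sweep (the rerf side is omitted, and cross-validation stands in for the function's own splitting logic):

import numpy as np
import sklearn.ensemble
import sklearn.model_selection

# Sweep n_estimators over param_values and cross-validate a random forest
# on the task data loaded above.
for value in range(20, 41, 20):
    clf = sklearn.ensemble.RandomForestClassifier(n_estimators=value)
    scores = sklearn.model_selection.cross_val_score(clf, X, y, cv=5)
    print(value, np.mean(scores))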
############################################################################
# Exercise
# ########
#
# Search for the tasks on the 'eeg-eye-state' dataset.
tasks.query('name=="eeg-eye-state"')
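############################################################################
# (Note: the `tasks` object queried above is assumed to be a pandas DataFrame
# of task metadata built earlier in the tutorial, roughly as below; the exact
# `list_tasks` arguments differ between openml-python versions.)
tasks = openml.tasks.list_tasks(output_format="dataframe", size=5000)
print(tasks.head())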
############################################################################
# Downloading tasks
# ^^^^^^^^^^^^^^^^^
#
# We provide two functions to download tasks, one which downloads only a
# single task by its ID, and one which takes a list of IDs and downloads
# all of these tasks:
task_id = 1
task = openml.tasks.get_task(task_id)
############################################################################
# Properties of the task are stored as member variables:
pprint(vars(task))
############################################################################
# And:
ids = [1, 2, 19, 97, 403]
tasks = openml.tasks.get_tasks(ids)
pprint(tasks[0])
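############################################################################
# Individual task attributes and the train/test splits can then be read off
# directly (a small illustrative addition, not part of the original tutorial):
task = tasks[0]
print(task.task_id, task.dataset_id)
train_indices, test_indices = task.get_train_test_split_indices()
print(len(train_indices), len(test_indices))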
    :param benchmark: benchmark name containing allowed resources, e.g. 'medium-8c4h'
    :param framework: framework name
    :param task_id: OpenML task id
    :param predictions: mapping from fold to predictions file
    :return: an OpenML run linked to the right task and flow, with the associated predictions.
    """
    cores, memory, time = parse_resource_parameters(benchmark)
    flow_id = amlb_flows[framework]
    parameters = [
        OrderedDict([('oml:name', 'cores'), ('oml:value', cores), ('oml:component', flow_id)]),
        OrderedDict([('oml:name', 'memory'), ('oml:value', memory), ('oml:component', flow_id)]),
        OrderedDict([('oml:name', 'time'), ('oml:value', time), ('oml:component', flow_id)]),
    ]
    task = openml.tasks.get_task(task_id)
    dataset_id = task.get_dataset().dataset_id
    benchmark_command = f'python3 runbenchmark.py {framework} {benchmark} -m aws -t {task_id}'
    predictions = load_format_predictions(task_id, predictions)
    return openml.runs.OpenMLRun(
        task_id=task_id, flow_id=flow_id, dataset_id=dataset_id,
        parameter_settings=parameters,
        setup_string=benchmark_command,
        data_content=predictions,
        tags=['study_218'],
    )
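The run returned above exists only locally. Uploading it would presumably look like the following, where build_amlb_run is a placeholder for the truncated function's real (unshown) name and the argument values are purely illustrative:

# `build_amlb_run` is a placeholder for the function defined above, whose
# name is cut off in the snippet; the arguments are illustrative values.
run = build_amlb_run('medium-8c4h', 'autosklearn', 59, fold_predictions)
run.publish()  # requires a configured OpenML API key
print(run.run_id)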