def test_classification_workflow(self):
    task = openml.tasks.get_task(254)
    X, y = task.get_X_and_y()
    ohe = OneHotEncoder(categorical_features=[True] * 22)
    tree = sklearn.tree.DecisionTreeClassifier(random_state=1)
    pipeline = sklearn.pipeline.Pipeline((('ohe', ohe), ('tree', tree)))
    X_train, X_test, y_train, y_test = \
        sklearn.cross_validation.train_test_split(X, y, random_state=3,
                                                   train_size=0.5,
                                                   test_size=0.5)
    pipeline.fit(X_train, y_train)
    self.assertEqual(np.mean(y_train == pipeline.predict(X_train)), 1)
    # With an incorrect copy operation the OneHotEncoder would rearrange
    # the data in such a way that the accuracy would drop to 66%
    self.assertEqual(np.mean(y_test == pipeline.predict(X_test)), 1)
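Note that this test targets older scikit-learn releases: sklearn.cross_validation was replaced by sklearn.model_selection, and OneHotEncoder no longer accepts a categorical_features argument. A rough modern equivalent of the same workflow, assuming X and y come from the task as above and all 22 columns are nominal, could look like this (a sketch, not the project's test code):

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# One-hot encode every column, then fit a decision tree.
pipeline = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore')),
                     ('tree', DecisionTreeClassifier(random_state=1))])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=3, train_size=0.5, test_size=0.5)
pipeline.fit(X_train, y_train)
print((pipeline.predict(X_test) == y_test).mean())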
def test_impute_with_constant(self):
    task_ids = [2]
    for task_id in task_ids:
        task = openml.tasks.get_task(task_id)
        dataset = task.get_dataset()
        X, _ = dataset.get_data(target=task.target_name)
        nominal_indices = dataset.get_features_by_type(
            'nominal', exclude=[task.target_name])
        fill_empty = -1
        clf = ConditionalImputer(strategy="median",
                                 strategy_nominal="most_frequent",
                                 categorical_features=None,
                                 verbose=True,
                                 fill_empty=fill_empty)
        self._do_test(dataset, X, nominal_indices, clf,
                      fill_empty=fill_empty)
def test__get_estimation_procedure_list(self):
    estimation_procedures = openml.tasks.functions.\
        _get_estimation_procedure_list()
    self.assertIsInstance(estimation_procedures, list)
    self.assertIsInstance(estimation_procedures[0], dict)
    self.assertEqual(estimation_procedures[0]['task_type_id'], 1)
def test_list_all(self):
    openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks)
def get_dataset_ids(task_ids):
    """Fetches the dataset ids for the given OpenML task ids.

    # Arguments
        task_ids: A list of OpenML task ids, or a single task id.

    # Returns
        dataset_list: A list of dataset ids, or a single dataset id if a
            single task id was given.
    """
    if isinstance(task_ids, list):
        return [openml.tasks.get_task(t_id).dataset_id for t_id in task_ids]
    return openml.tasks.get_task(task_ids).dataset_id
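A quick usage sketch for the helper above (the task ids are only illustrative and require a reachable OpenML server):

print(get_dataset_ids([2, 31]))   # list with one dataset id per task
print(get_dataset_ids(31))        # a single dataset id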
    repeat = int(evaluation_dict['@repeat'])
    fold = int(evaluation_dict['@fold'])
    if key not in fold_evaluations:
        fold_evaluations[key] = OrderedDict()
    if repeat not in fold_evaluations[key]:
        fold_evaluations[key][repeat] = OrderedDict()
    fold_evaluations[key][repeat][fold] = value
else:
    evaluations[key] = value
if 'description' not in files and from_server is True:
    raise ValueError('No description file for run %d in run '
                     'description XML' % run_id)

if 'predictions' not in files and from_server is True:
    task = openml.tasks.get_task(task_id)
    if task.task_type_id == TaskTypeEnum.SUBGROUP_DISCOVERY:
        raise NotImplementedError(
            'Subgroup discovery tasks are not yet supported.'
        )
    else:
        # JvR: actually, I am not sure whether this error should be raised.
        # A run can exist without predictions, but for now let's keep it.
        # Matthias: yes, it should stay as long as we do not really handle
        # this stuff.
        raise ValueError('No prediction files for run %d in run '
                         'description XML' % run_id)

tags = openml.utils.extract_xml_tags('oml:tag', run)

return OpenMLRun(run_id=run_id, uploader=uploader,
                 uploader_name=uploader_name, task_id=task_id,
    ----------
    task_ids : iterable
    dataset_ids : iterable
    flow_ids : iterable
    run_ids : iterable

    Returns
    -------
    None
    """
    if task_ids is not None:
        for task_id in task_ids:
            tasks.functions.get_task(task_id)
    if dataset_ids is not None:
        for dataset_id in dataset_ids:
            datasets.functions.get_dataset(dataset_id)
    if flow_ids is not None:
        for flow_id in flow_ids:
            flows.functions.get_flow(flow_id)
    if run_ids is not None:
        for run_id in run_ids:
            runs.functions.get_run(run_id)
# There are two functions to download tasks: one which fetches a
# single task by its ID, and one which takes a list of IDs and downloads
# all of these tasks:
task_id = 31
task = openml.tasks.get_task(task_id)
############################################################################
# Properties of the task are stored as member variables:
print(task)
############################################################################
# And:
ids = [2, 1891, 31, 9983]
tasks = openml.tasks.get_tasks(ids)
print(tasks[0])
############################################################################
# Creating tasks
# ^^^^^^^^^^^^^^
#
# You can also create new tasks. Take the following into account:
#
# * You can only create tasks on _active_ datasets
# * For now, only the following tasks are supported: classification, regression,
# clustering, and learning curve analysis.
# * For now, tasks can only be created on a single dataset.
# * The exact same task must not already exist.
#
# Creating a task requires the following input:
#
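# The list of required inputs is not included in this excerpt. As a rough sketch (not the
# official example), task creation via openml.tasks.create_task could look roughly as follows;
# the keyword names, the enum import location, and the dataset id, target name and estimation
# procedure id are assumptions that may differ between openml-python releases:
from openml.tasks import TaskTypeEnum  # assumed location of the task-type enum

my_task = openml.tasks.create_task(
    task_type=TaskTypeEnum.SUPERVISED_CLASSIFICATION,  # assumed keyword and enum value
    dataset_id=128,              # an active dataset (illustrative id)
    target_name='class',         # target attribute of that dataset (illustrative)
    estimation_procedure_id=1,   # e.g. 10-fold cross-validation (illustrative)
)
# my_task.publish() would then upload the new task to the server (requires an API key).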
print(suite)
####################################################################################################
# The benchmark suite does not download the included tasks and datasets itself, but only contains
# a list of which tasks constitute the study.
#
# Tasks can then be accessed via
tasks = suite.tasks
print(tasks)
####################################################################################################
# and iterated over for benchmarking. For speed reasons we only iterate over the first three tasks:
for task_id in tasks[:3]:
task = openml.tasks.get_task(task_id)
print(task)
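####################################################################################################
# A natural next step when benchmarking is to actually run a model on each task. A minimal, hedged
# sketch (assuming a scikit-learn estimator and openml.runs.run_model_on_task; publishing the
# resulting runs would additionally require an API key):
from sklearn.tree import DecisionTreeClassifier

for task_id in tasks[:3]:
    task = openml.tasks.get_task(task_id)
    run = openml.runs.run_model_on_task(DecisionTreeClassifier(), task)
    print(run)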