"""
classes_without_random_state = [
    'sklearn.model_selection._search.GridSearchCV',
    'sklearn.pipeline.Pipeline',
    'sklearn.linear_model.base.LinearRegression',
]
def _remove_random_state(flow):
if 'random_state' in flow.parameters:
del flow.parameters['random_state']
for component in flow.components.values():
_remove_random_state(component)
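# A minimal usage sketch (assuming scikit-learn and the OpenML sklearn extension
# are importable): convert a seeded estimator to a flow and strip the seeds so
# that flow comparison ignores random_state values.
from sklearn.tree import DecisionTreeClassifier
from openml.extensions.sklearn import SklearnExtension

example_flow = SklearnExtension().model_to_flow(DecisionTreeClassifier(random_state=1))
_remove_random_state(example_flow)
assert 'random_state' not in example_flow.parameters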
flow = self.extension.model_to_flow(clf)
flow, _ = self._add_sentinel_to_flow_name(flow, sentinel)
if not openml.flows.flow_exists(flow.name, flow.external_version):
flow.publish()
TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id))
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y()
self.assertEqual(np.count_nonzero(np.isnan(X)), n_missing_vals)
run = openml.runs.run_flow_on_task(
flow=flow,
task=task,
seed=seed,
avoid_duplicate_runs=openml.config.avoid_duplicate_runs,
)
run_ = run.publish()
TestBase._mark_entity_for_removal('run', run.run_id)
def test_tagging(self):
flow_list = openml.flows.list_flows(size=1)
flow_id = list(flow_list.keys())[0]
flow = openml.flows.get_flow(flow_id)
tag = "testing_tag_{}_{}".format(self.id(), time.time())
flow_list = openml.flows.list_flows(tag=tag)
self.assertEqual(len(flow_list), 0)
flow.push_tag(tag)
flow_list = openml.flows.list_flows(tag=tag)
self.assertEqual(len(flow_list), 1)
self.assertIn(flow_id, flow_list)
flow.remove_tag(tag)
flow_list = openml.flows.list_flows(tag=tag)
self.assertEqual(len(flow_list), 0)
def initialize_model(setup_id: int) -> Any:
"""
    Initialize a model based on a setup_id (i.e., using the exact
    same parameter settings).

    Parameters
    ----------
    setup_id : int
        The OpenML setup_id

    Returns
    -------
    model
"""
setup = get_setup(setup_id)
flow = openml.flows.get_flow(setup.flow_id)
    # Instead of using scikit-learn's (or any other library's) "set_params"
    # function, we override the OpenMLFlow object's default parameter values so
    # that Extension.flow_to_model() reinitializes the flow with these values.
for hyperparameter in setup.parameters.values():
structure = flow.get_structure('flow_id')
if len(structure[hyperparameter.flow_id]) > 0:
subflow = flow.get_subflow(structure[hyperparameter.flow_id])
else:
subflow = flow
subflow.parameters[hyperparameter.parameter_name] = \
hyperparameter.value
model = flow.extension.flow_to_model(flow)
return model
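# Usage sketch: the setup id below is hypothetical and must exist on the
# configured OpenML server.
example_model = initialize_model(setup_id=9)  # hypothetical setup id
print(example_model)  # e.g. a scikit-learn estimator carrying the setup's hyperparameters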
# pprint(vars(run), depth=2)
############################################################################
# Share the run on the OpenML server
#
# So far the run is only available locally. By calling the publish function, the run is sent to the OpenML server:
myrun = run.publish()
# For this tutorial, our configuration publishes to the test server
# so as not to pollute the main server.
print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))
############################################################################
# We can now also inspect the flow object which was automatically created:
flow = openml.flows.get_flow(run.flow_id)
pprint(vars(flow), depth=1)
############################################################################
# It also works with pipelines
# ############################
#
# When you need to handle 'dirty' data, build pipelines to model them automatically.
task = openml.tasks.get_task(115)
pipe = pipeline.Pipeline(steps=[
('Imputer', preprocessing.Imputer(strategy='median')),
('OneHotEncoder', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')),
('Classifier', ensemble.RandomForestClassifier())
])
flow = openml.flows.sklearn_to_flow(pipe)
run = openml.runs.run_flow_on_task(flow, task, avoid_duplicate_runs=False)
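############################################################################
# As in the earlier example, the pipeline run can be published to the (test) server:
myrun = run.publish()
print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))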
def get_flow_overview():
"""
:return: overview page for flows
"""
df = flows.list_flows(output_format='dataframe')
count = pd.DataFrame(df["name"].value_counts()).reset_index()
count.columns = ["name", "count"]
count = count[0:1000]
short = []
for name in count["name"]:
        try:
            short.append(SklearnExtension.trim_flow_name(name))
        except Exception:  # skip names that cannot be trimmed
            pass
count["name"] = short
fig = go.Figure(data=[go.Bar(y=count["name"].values, x=count["count"].values,
marker=dict(color='blue',
opacity=0.8),
orientation="h")])
    fig.update_layout(
        yaxis=dict(autorange="reversed"),
    )
    return fig
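# Usage sketch (hypothetical output path): build the overview figure and save it.
overview_fig = get_flow_overview()
overview_fig.write_html("flow_overview.html")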
def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
""" Collect all information to display in the __repr__ body. """
fields = {"Uploader Name": self.uploader_name,
"Metric": self.task_evaluation_measure,
"Run ID": self.run_id,
"Task ID": self.task_id,
"Task Type": self.task_type,
"Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id),
"Flow ID": self.flow_id,
"Flow Name": self.flow_name,
"Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id),
"Setup ID": self.setup_id,
"Setup String": self.setup_string,
"Dataset ID": self.dataset_id,
"Dataset URL": openml.datasets.OpenMLDataset.url_for_id(self.dataset_id)}
if self.uploader is not None:
fields["Uploader Profile"] = "{}/u/{}".format(openml.config.get_server_base_url(),
self.uploader)
if self.run_id is not None:
fields["Run URL"] = self.openml_url
if self.evaluations is not None and self.task_evaluation_measure in self.evaluations:
fields["Result"] = self.evaluations[self.task_evaluation_measure]
# determines the order in which the information will be printed
order = ["Uploader Name", "Uploader Profile", "Metric", "Result", "Run ID", "Run URL",
"Task ID", "Task Type", "Task URL", "Flow ID", "Flow Name", "Flow URL",
"Setup ID", "Setup String", "Dataset ID", "Dataset URL"]
"(This should never happen.) "
)
if self.flow_id is None:
if self.flow is None:
raise PyOpenMLError(
"OpenMLRun object does not contain a flow id or reference to OpenMLFlow "
"(these should have been added while executing the task). "
)
else:
# publish the linked Flow before publishing the run.
self.flow.publish()
self.flow_id = self.flow.flow_id
if self.parameter_settings is None:
if self.flow is None:
self.flow = openml.flows.get_flow(self.flow_id)
self.parameter_settings = self.flow.extension.obtain_parameter_values(
self.flow,
self.model,
)
file_elements = {'description': ("description.xml", self._to_xml())}
if self.error_message is None:
predictions = arff.dumps(self._generate_arff_dict())
file_elements['predictions'] = ("predictions.arff", predictions)
if self.trace is not None:
trace_arff = arff.dumps(self.trace.trace_to_arff())
file_elements['trace'] = ("trace.arff", trace_arff)
return file_elements
flow : OpenMLFlow
OpenMLFlow object (containing flow ids, i.e., it has to be downloaded from the server)
model: Any, optional (default=None)
The model from which to obtain the parameter values. Must match the flow signature.
If None, use the model specified in ``OpenMLFlow.model``.
Returns
-------
list
A list of dicts, where each dict has the following entries:
- ``oml:name`` : str: The OpenML parameter name
- ``oml:value`` : mixed: A representation of the parameter value
- ``oml:component`` : int: flow id to which the parameter belongs
"""
openml.flows.functions._check_flow_for_server_id(flow)
def get_flow_dict(_flow):
flow_map = {_flow.name: _flow.flow_id}
for subflow in _flow.components:
flow_map.update(get_flow_dict(_flow.components[subflow]))
return flow_map
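    # For a nested flow such as a Pipeline (hypothetical ids), get_flow_dict
    # flattens the component structure into a {flow name: flow id} mapping, e.g.
    #   {'sklearn.pipeline.Pipeline(...)': 100,
    #    'sklearn.ensemble.RandomForestClassifier': 101}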
def extract_parameters(_flow, _flow_dict, component_model,
_main_call=False, main_id=None):
def is_subcomponent_specification(values):
# checks whether the current value can be a specification of
# subcomponents, as for example the value for steps parameter
# (in Pipeline) or transformers parameter (in
# ColumnTransformer). These are always lists/tuples of lists/
# tuples, size bigger than 2 and an OpenMLFlow item involved.
if not isinstance(values, (tuple, list)):