import pandas as pd
from sklearn.model_selection import train_test_split

import lale.datasets.data_schemas


def _bunch_to_df(bunch, schema_X, schema_y, test_size=0.2, random_state=42):
    train_X_arr, test_X_arr, train_y_arr, test_y_arr = train_test_split(
        bunch.data, bunch.target,
        test_size=test_size, random_state=random_state)
    feature_schemas = schema_X['items']['items']
    if isinstance(feature_schemas, list):
        # Per-column schemas: use their descriptions as column names.
        feature_names = [f['description'] for f in feature_schemas]
    else:
        # Homogeneous columns: synthesize names x0, x1, ...
        feature_names = [f'x{i}' for i in range(schema_X['items']['maxItems'])]
    train_X_df = pd.DataFrame(train_X_arr, columns=feature_names)
    test_X_df = pd.DataFrame(test_X_arr, columns=feature_names)
    train_y_df = pd.Series(train_y_arr, name='target')
    test_y_df = pd.Series(test_y_arr, name='target')
    train_nrows, test_nrows = train_X_df.shape[0], test_X_df.shape[0]
    # Attach JSON schemas whose row counts match each split.
    train_X = lale.datasets.data_schemas.add_schema(train_X_df, {
        **schema_X, 'minItems': train_nrows, 'maxItems': train_nrows})
    test_X = lale.datasets.data_schemas.add_schema(test_X_df, {
        **schema_X, 'minItems': test_nrows, 'maxItems': test_nrows})
    train_y = lale.datasets.data_schemas.add_schema(train_y_df, {
        **schema_y, 'minItems': train_nrows, 'maxItems': train_nrows})
    test_y = lale.datasets.data_schemas.add_schema(test_y_df, {
        **schema_y, 'minItems': test_nrows, 'maxItems': test_nrows})
    return (train_X, train_y), (test_X, test_y)
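
For context, here is a minimal usage sketch. The iris Bunch comes from scikit-learn; the two schemas are illustrative stand-ins, not schemas taken from the lale source:

import sklearn.datasets

bunch = sklearn.datasets.load_iris()
schema_X = {'type': 'array', 'items': {
    'type': 'array', 'minItems': 4, 'maxItems': 4,
    'items': {'type': 'number'}}}
schema_y = {'type': 'array', 'items': {'enum': [0, 1, 2]}}
(train_X, train_y), (test_X, test_y) = _bunch_to_df(bunch, schema_X, schema_y)
# schema_X['items']['items'] is a single dict, not a list, so column
# names are synthesized:
print(train_X.columns.tolist())  # ['x0', 'x1', 'x2', 'x3']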
# Fragment from a dataset-fetching routine: `df_all`, `col_names`, `target_col`,
# `schema_orig`, `test_size`, `verbose`, and `preprocess` come from the
# surrounding function, and `add_schemas` is shown further below.
cols_X = [col for col in col_names if col != target_col]
X = df_all[cols_X]
y = df_all[target_col]  # restored: `y` must be bound before it is encoded
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=0)
if verbose:
    print(f'training set shapes: X {X_train.shape}, y {y_train.shape}')
    print(f'test set shapes: X {X_test.shape}, y {y_test.shape}')
if preprocess:
    from lale.datasets.data_schemas import add_schema
    X_train = add_schema(X_train.astype(np.number), recalc=True)
    y_train = add_schema(y_train.astype(int), recalc=True)  # np.int was removed in NumPy >= 1.24
    X_test = add_schema(X_test.astype(np.number), recalc=True)
    y_test = add_schema(y_test.astype(int), recalc=True)
else:
    X_train, X_test, y_train, y_test = add_schemas(
        schema_orig, target_col, X_train, X_test, y_train, y_test)
return (X_train, y_train), (X_test, y_test)
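
As a side note on the encoding step: LabelEncoder assigns integer codes in sorted-class order, which is what the schema code below accounts for when it rewrites the target's enum to a range of integers. A small self-contained illustration:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
print(le.fit_transform(['cat', 'dog', 'cat', 'bird']))  # [1 2 1 0]
print(le.classes_)  # ['bird' 'cat' 'dog'] -- codes follow sorted order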
# The def line and docstring summary are reconstructed; the original snippet
# began mid-docstring. The body converts an aif360-style dataset to pandas.
def dataset_to_pandas(dataset, return_only='Xy'):
    """Convert an aif360 dataset to its pandas representation.

    Returns
    -------
    result : tuple
      - item 0: pandas DataFrame or None, features X
      - item 1: pandas Series or None, labels y
    """
    if 'X' in return_only:
        X = pd.DataFrame(dataset.features, columns=dataset.feature_names)
        result_X = lale.datasets.data_schemas.add_schema(X)
        assert isinstance(result_X, pd.DataFrame), type(result_X)
    else:
        result_X = None
    if 'y' in return_only:
        y = pd.Series(dataset.labels.ravel(), name=dataset.label_names[0])
        result_y = lale.datasets.data_schemas.add_schema(y)
        assert isinstance(result_y, pd.Series), type(result_y)
    else:
        result_y = None
    return result_X, result_y
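
A hedged usage sketch for the function above; FakeDataset is a hypothetical stand-in that only mimics the four attributes an aif360 dataset exposes:

import numpy as np

class FakeDataset:
    # Purely illustrative substitute for aif360's dataset interface.
    features = np.array([[1.0, 0.0], [0.0, 1.0]])
    feature_names = ['age', 'income']
    labels = np.array([[0.0], [1.0]])
    label_names = ['approved']

X, y = dataset_to_pandas(FakeDataset(), return_only='Xy')
print(X.shape, y.name)  # (2, 2) approved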
# The def line and the opening of the first comprehension are reconstructed
# from the call site above; `pd` and `add_schema` are imported as shown earlier.
def add_schemas(schema_orig, target_col, train_X, test_X, train_y, test_y):
    # Split the original per-column schemas into features vs. target.
    elems_X = [item_schema for item_schema in schema_orig['items']['items']
               if item_schema['description'].lower() != target_col]
    elem_y = [item_schema for item_schema in schema_orig['items']['items']
              if item_schema['description'].lower() == target_col][0]
    if 'enum' in elem_y:
        # Replace categorical values with their integer codes, mirroring
        # the LabelEncoder step above.
        elem_y['enum'] = [*range(len(elem_y['enum']))]
    ncols_X = len(elems_X)
    rows_X = {
        **schema_orig['items'],
        'minItems': ncols_X, 'maxItems': ncols_X, 'items': elems_X}
    if 'json_schema' not in pd.DataFrame._internal_names:
        pd.DataFrame._internal_names.append('json_schema')
    nrows_train, nrows_test = len(train_y), len(test_y)
    train_X = add_schema(train_X, {
        **schema_orig,
        'minItems': nrows_train, 'maxItems': nrows_train, 'items': rows_X})
    test_X = add_schema(test_X, {
        **schema_orig,
        'minItems': nrows_test, 'maxItems': nrows_test, 'items': rows_X})
    train_y = add_schema(train_y, {
        **schema_orig,
        'minItems': nrows_train, 'maxItems': nrows_train, 'items': elem_y})
    test_y = add_schema(test_y, {
        **schema_orig,
        'minItems': nrows_test, 'maxItems': nrows_test, 'items': elem_y})
    return train_X, test_X, train_y, test_y
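
To make the schema manipulation concrete, here is a hypothetical schema_orig for a two-column dataset and what the function derives from it:

# Illustrative only; real OpenML schemas in lale carry more metadata.
schema_orig = {
    'type': 'array',
    'items': {
        'type': 'array', 'minItems': 2, 'maxItems': 2,
        'items': [
            {'description': 'sepal_len', 'type': 'number'},
            {'description': 'class', 'enum': ['setosa', 'versicolor']}]}}
# With target_col == 'class', elems_X keeps only the 'sepal_len' schema,
# and elem_y['enum'] is rewritten to [0, 1] to match the encoded labels.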