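# Excerpt (apparently from a Ludwig-style preprocessing function): builds a
# processed dataset from raw CSV input, optionally caches it as HDF5/JSON next
# to the source files, and returns (training_set, test_set, validation_set,
# train_set_metadata).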
if data_csv is not None:
    # a single full CSV was provided: preprocess it from scratch, then split
    logger.info(
        'Using full raw csv, no hdf5 and json file '
        'with the same name have been found'
    )
    logger.info('Building dataset (it may take a while)')
    data, train_set_metadata = build_dataset(
        data_csv,
        features,
        preprocessing_params,
        train_set_metadata=train_set_metadata,
        random_seed=random_seed
    )
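    # Optionally cache the processed data and its metadata next to the source
    # CSV so later runs can load them instead of preprocessing again.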
    if not skip_save_processed_input:
        logger.info('Writing dataset')
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
        train_set_metadata[DATA_TRAIN_HDF5_FP] = data_hdf5_fp
        logger.info('Writing train set metadata with vocabulary')
        train_set_metadata_json_fp = replace_file_extension(
            data_csv,
            'json'
        )
        data_utils.save_json(
            train_set_metadata_json_fp, train_set_metadata)
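    # Partition the processed data according to its 'split' column.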
    training_set, test_set, validation_set = split_dataset_tvt(
        data,
        data['split']
    )
elif data_train_csv is not None:
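    # Separate train (and optionally validation/test) CSVs were provided:
    # they are concatenated and preprocessed together, so feature encodings
    # and vocabularies stay consistent across the splits, then re-split.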
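    # `concatenated_df` (the provided CSVs concatenated into one dataframe)
    # is assumed to be built just before this excerpt.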
    concatenated_df.csv = data_train_csv
    data, train_set_metadata = build_dataset_df(
        concatenated_df,
        features,
        preprocessing_params,
        train_set_metadata=train_set_metadata,
        random_seed=random_seed
    )
    training_set, test_set, validation_set = split_dataset_tvt(
        data,
        data['split']
    )
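    # Each split is cached to its own HDF5 file, while a single metadata JSON
    # with the vocabularies is shared by all of them.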
    if not skip_save_processed_input:
        logger.info('Writing dataset')
        data_train_hdf5_fp = replace_file_extension(data_train_csv, 'hdf5')
        data_utils.save_hdf5(
            data_train_hdf5_fp,
            training_set,
            train_set_metadata
        )
        train_set_metadata[DATA_TRAIN_HDF5_FP] = data_train_hdf5_fp
        if validation_set is not None:
            data_validation_hdf5_fp = replace_file_extension(
                data_validation_csv,
                'hdf5'
            )
            data_utils.save_hdf5(
                data_validation_hdf5_fp,
                validation_set,
                train_set_metadata
            )
            # the same key is set again, still pointing at the training HDF5
            train_set_metadata[DATA_TRAIN_HDF5_FP] = data_train_hdf5_fp
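        # the test split, when present, is cached the same way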
        if test_set is not None:
            data_test_hdf5_fp = replace_file_extension(
                data_test_csv,
                'hdf5'
            )
            data_utils.save_hdf5(
                data_test_hdf5_fp,
                test_set,
                train_set_metadata
            )
            train_set_metadata[DATA_TRAIN_HDF5_FP] = data_train_hdf5_fp
        logger.info('Writing train set metadata with vocabulary')
        train_set_metadata_json_fp = replace_file_extension(
            data_train_csv,
            'json'
        )
        data_utils.save_json(
            train_set_metadata_json_fp,
            train_set_metadata,
        )
return training_set, test_set, validation_set, train_set_metadata