import os

# replace_file_extension is assumed to live in the project's data utilities.
from ludwig.utils.data_utils import replace_file_extension


def delete_temporary_data(csv_path):
    """
    Helper method to delete temporary data created for running tests. Deletes
    the csv and hdf5/json data (if any).
    :param csv_path: path to the csv data file
    :return: None
    """
    if os.path.isfile(csv_path):
        os.remove(csv_path)

    json_path = replace_file_extension(csv_path, 'json')
    if os.path.isfile(json_path):
        os.remove(json_path)

    hdf5_path = replace_file_extension(csv_path, 'hdf5')
    if os.path.isfile(hdf5_path):
        os.remove(hdf5_path)
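# The cleanup helper above leans on replace_file_extension. A minimal,
# hypothetical sketch of the assumed behaviour (swap a path's suffix), not the
# library's actual implementation:
def replace_file_extension_sketch(file_path, extension):
    # 'data.csv' + 'json' -> 'data.json'
    base, _ = os.path.splitext(file_path)
    return base + '.' + extension


# Example cleanup of a hypothetical test artifact and its json/hdf5 siblings:
# delete_temporary_data('/tmp/test_data.csv')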
# Multi-csv path: the concatenated train/validation/test data is preprocessed
# as one dataframe, split back into the three sets, and cached as hdf5 files
# next to the original csvs.
concatenated_df.csv = data_train_csv
data, train_set_metadata = build_dataset_df(
    concatenated_df,
    features,
    preprocessing_params,
    train_set_metadata=train_set_metadata,
    random_seed=random_seed
)
training_set, test_set, validation_set = split_dataset_tvt(
    data,
    data['split']
)
if not skip_save_processed_input:
    logger.info('Writing dataset')
    data_train_hdf5_fp = replace_file_extension(data_train_csv, 'hdf5')
    data_utils.save_hdf5(
        data_train_hdf5_fp,
        training_set,
        train_set_metadata
    )
    train_set_metadata[DATA_TRAIN_HDF5_FP] = data_train_hdf5_fp
    if validation_set is not None:
        data_validation_hdf5_fp = replace_file_extension(
            data_validation_csv,
            'hdf5'
        )
        data_utils.save_hdf5(
            data_validation_hdf5_fp,
            validation_set,
            train_set_metadata
        )
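# For concreteness, with hypothetical inputs train.csv and validation.csv the
# caching step above would write:
#   train.hdf5       <- replace_file_extension(data_train_csv, 'hdf5')
#   validation.hdf5  <- replace_file_extension(data_validation_csv, 'hdf5')
# and record the training file under train_set_metadata[DATA_TRAIN_HDF5_FP].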
# Single-csv path: use the full raw csv and ignore any _train/_validation/
# _test files and existing train set metadata; the data needs preprocessing.
logger.info(
    'Using full raw csv, no hdf5 and json file '
    'with the same name have been found'
)
logger.info('Building dataset (it may take a while)')
data, train_set_metadata = build_dataset(
    data_csv,
    features,
    preprocessing_params,
    train_set_metadata=train_set_metadata,
    random_seed=random_seed
)
if not skip_save_processed_input:
    logger.info('Writing dataset')
    data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
    data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
    train_set_metadata[DATA_TRAIN_HDF5_FP] = data_hdf5_fp
    logger.info('Writing train set metadata with vocabulary')
    train_set_metadata_json_fp = replace_file_extension(
        data_csv,
        'json'
    )
    data_utils.save_json(
        train_set_metadata_json_fp, train_set_metadata
    )
training_set, test_set, validation_set = split_dataset_tvt(
    data,
    data['split']
)
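# The branches that follow decide whether previously cached data can be reused
# by checking for hdf5/json files saved next to the csvs. A minimal sketch of
# the assumed behaviour of file_exists_with_diff_extension, not the library's
# actual implementation:
def file_exists_with_diff_extension_sketch(file_path, extension):
    # True if a sibling of file_path with the given extension exists,
    # e.g. 'data.csv' -> check for 'data.hdf5'
    base, _ = os.path.splitext(file_path)
    return os.path.isfile(base + '.' + extension)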
# Separate-csvs path: if preprocessed hdf5/json siblings exist for every csv,
# re-dispatch to the hdf5 branch and reuse them; otherwise preprocess the csvs.
if (file_exists_with_diff_extension(train_fp, 'hdf5') and
        file_exists_with_diff_extension(train_fp, 'json') and
        file_exists_with_diff_extension(validation_fp, 'hdf5') and
        file_exists_with_diff_extension(test_fp, 'hdf5')):
    logger.info(
        'Found hdf5 and json with the same filename '
        'of the csvs, using them instead.'
    )
    return preprocess_for_training_by_type(
        model_definition,
        'hdf5',
        train_fp=replace_file_extension(train_fp, 'hdf5'),
        validation_fp=replace_file_extension(
            validation_fp,
            'hdf5'
        ),
        test_fp=replace_file_extension(test_fp, 'hdf5'),
        train_set_metadata_json=replace_file_extension(
            train_fp,
            'json'
        ),
        skip_save_processed_input=skip_save_processed_input,
        preprocessing_params=preprocessing_params,
        random_seed=random_seed
    )
else:
    (
        training_set,
        test_set,
        validation_set,
        train_set_metadata
    ) = _preprocess_csv_for_training(
        features=features,
        data_csv=None,
    )
elif data_type == 'csv':
    # Single-csv case: note where the cached hdf5 would live, then either
    # reuse existing hdf5/json siblings or preprocess the csv.
    data_hdf5_fp = replace_file_extension(
        all_data_fp, 'hdf5'
    )
    model_definition['data_hdf5_fp'] = data_hdf5_fp
    if all_data_fp is not None:
        if (file_exists_with_diff_extension(all_data_fp, 'hdf5') and
                file_exists_with_diff_extension(all_data_fp, 'json')):
            # use hdf5 data instead
            logger.info(
                'Found hdf5 and json with the same filename '
                'of the csv, using them instead'
            )
            return preprocess_for_training_by_type(
                model_definition,
                'hdf5',
                all_data_fp=replace_file_extension(all_data_fp, 'hdf5'),
                train_set_metadata_json=replace_file_extension(
                    all_data_fp, 'json'
                ),
                skip_save_processed_input=skip_save_processed_input,
                preprocessing_params=preprocessing_params,
                random_seed=random_seed
            )
        else:
            (
                training_set,
                test_set,
                validation_set,
                train_set_metadata
            ) = _preprocess_csv_for_training(
                features=features,
                data_csv=all_data_fp,
                data_train_csv=None,
                data_validation_csv=None,
            )
validation_set = load_data(
    validation_fp,
    model_definition['input_features'],
    model_definition['output_features'],
    split_data=False
)
test_set = None
if test_fp is not None:
    test_set = load_data(
        test_fp,
        model_definition['input_features'],
        model_definition['output_features'],
        split_data=False
    )
# Dataframe path: preprocess in-memory train/validation/test dataframes
# directly, producing the same split sets and metadata.
(
    training_set,
    test_set,
    validation_set,
    train_set_metadata
) = _preprocess_df_for_training(
    features,
    all_data_df,
    train_df,
    validation_df,
    test_df,
    train_set_metadata_json=train_set_metadata_json,
    preprocessing_params=preprocessing_params,
    random_seed=random_seed
)
elif data_type == 'hdf5' and train_set_metadata_json is None:
    raise ValueError(
        'train set metadata file is not found along with hdf5 data'
    )
elif data_type == 'hdf5':
    # hdf5 path: the data is already preprocessed, so load it together with
    # the json metadata instead of building the dataset again.
    if all_data_fp is not None:
        data_hdf5_fp = replace_file_extension(all_data_fp, 'hdf5')
        logger.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            all_data_fp,
            model_definition['input_features'],
            model_definition['output_features'],
            shuffle_training=True
        )
        train_set_metadata = load_metadata(train_set_metadata_json)
    elif train_fp is not None:
        logger.info('Using hdf5 and json')
        training_set = load_data(
            train_fp,
            model_definition['input_features'],
            model_definition['output_features'],
            split_data=False
        )
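# Illustrative top-level call, using only the parameters that the recursive
# calls above pass explicitly. 'all_data.csv' is a hypothetical path and the
# returned tuple shape is assumed from the unpacking shown in the snippets:
#
# training_set, test_set, validation_set, train_set_metadata = (
#     preprocess_for_training_by_type(
#         model_definition,
#         'csv',
#         all_data_fp='all_data.csv',
#         skip_save_processed_input=False,
#         preprocessing_params=preprocessing_params,
#         random_seed=random_seed
#     )
# )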