Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
train_set_metadata[DATA_TRAIN_HDF5_FP] = data_train_hdf5_fp
if test_set is not None:
data_test_hdf5_fp = replace_file_extension(data_test_csv,
'hdf5')
data_utils.save_hdf5(
data_test_hdf5_fp,
test_set,
train_set_metadata
)
train_set_metadata[DATA_TRAIN_HDF5_FP] = data_train_hdf5_fp
logger.info('Writing train set metadata with vocabulary')
train_set_metadata_json_fp = replace_file_extension(data_train_csv,
'json')
data_utils.save_json(
train_set_metadata_json_fp,
train_set_metadata,
)
return training_set, test_set, validation_set, train_set_metadata
if validation_set is not None:
logger.info('Validation set: {0}'.format(validation_set.size))
if test_set is not None:
logger.info('Test set: {0}'.format(test_set.size))
# update model definition with metadata properties
update_model_definition_with_metadata(
model_definition,
train_set_metadata
)
if is_on_master():
if not skip_save_model:
# save train set metadata
os.makedirs(model_dir, exist_ok=True)
save_json(
os.path.join(
model_dir,
TRAIN_SET_METADATA_FILE_NAME
),
train_set_metadata
)
# run the experiment
model, result = train(
training_set=training_set,
validation_set=validation_set,
test_set=test_set,
model_definition=model_definition,
save_path=model_dir,
model_load_path=model_load_path,
resume=model_resume_path is not None,
description = get_experiment_description(
model_definition,
data_csv=data_csv,
data_train_csv=data_train_csv,
data_validation_csv=data_validation_csv,
data_test_csv=data_test_csv,
data_hdf5=data_hdf5,
data_train_hdf5=data_train_hdf5,
data_validation_hdf5=data_validation_hdf5,
data_test_hdf5=data_test_hdf5,
metadata_json=train_set_metadata_json,
random_seed=random_seed
)
if is_on_master():
if not skip_save_training_description:
save_json(description_fn, description)
# print description
logger.info('Experiment name: {}'.format(experiment_name))
logger.info('Model name: {}'.format(model_name))
logger.info('Output path: {}'.format(experiment_dir_name))
logger.info('\n')
for key, value in description.items():
logger.info('{}: {}'.format(key, pformat(value, indent=4)))
logger.info('\n')
# preprocess
preprocessed_data = preprocess_for_training(
model_definition,
data_df=data_df,
data_train_df=data_train_df,
data_validation_df=data_validation_df,
data_test_df=data_test_df,
def save_test_statistics(test_stats, experiment_dir_name):
    """Persist the test statistics dict as JSON in the experiment directory.

    The file name is fixed ('test_statistics.json') by convention so that
    downstream tooling can locate it inside `experiment_dir_name`.
    """
    save_json(
        os.path.join(experiment_dir_name, 'test_statistics.json'),
        test_stats
    )
'initialization and training set shuffling'
)
args = parser.parse_args()
data, train_set_metadata = build_dataset(
args.dataset_csv,
args.train_set_metadata_json,
args.features,
args.preprocessing_parameters,
args.random_seed
)
# write train set metadata, dataset
logger.info('Writing train set metadata with vocabulary')
data_utils.save_json(args.output_metadata_json, train_set_metadata)
logger.info('Writing dataset')
data_utils.save_hdf5(args.output_dataset_h5, data, train_set_metadata)
)
train_trainset_stats, train_valisest_stats, train_testset_stats = result
train_stats = {
'train': train_trainset_stats,
'validation': train_valisest_stats,
'test': train_testset_stats
}
if should_close_session:
model.close_session()
# save training statistics
if is_on_master():
if not skip_save_training_statistics:
save_json(training_stats_fn, train_stats)
# grab the results of the model with highest validation test performance
validation_field = model_definition['training']['validation_field']
validation_measure = model_definition['training']['validation_measure']
validation_field_result = train_valisest_stats[validation_field]
best_function = get_best_function(validation_measure)
# results of the model with highest validation test performance
if is_on_master() and validation_set is not None:
epoch_best_vali_measure, best_vali_measure = best_function(
enumerate(validation_field_result[validation_measure]),
key=lambda pair: pair[1]
)
logger.info(
'Best validation model epoch: {0}'.format(
epoch_best_vali_measure + 1)
def save_hyperparameters(self, hyperparameters, save_path):
    """Save model hyperparameters to `save_path` as sorted, indented JSON.

    Pretrained-embeddings paths are removed from the saved copy because
    the embedding weights are already stored with the model, so there is
    no need to reload them from their original paths when the model is
    loaded next time (and those paths may not exist on another machine).

    :param hyperparameters: dict with 'input_features' and
        'output_features' lists of feature dicts; left unmodified.
    :param save_path: destination path for the JSON file.
    """
    # Deep-copy so the caller's hyperparameters are not mutated.
    local_hyperparameters = copy.deepcopy(hyperparameters)
    for feature in (local_hyperparameters['input_features'] +
                    local_hyperparameters['output_features']):
        if 'pretrained_embeddings' in feature:
            feature['pretrained_embeddings'] = None
    # Bug fix: save the sanitized copy, not the original, otherwise the
    # pretrained-embeddings paths would still end up in the saved JSON
    # and the sanitization loop above would have no effect.
    save_json(save_path, local_hyperparameters, sort_keys=True, indent=4)