# Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.
# NOTE(review): fragment of a Horovod/MXNet allreduce+broadcast test; the
# enclosing loop over (count, dtype, dim, ...) and the definitions of
# size, tensor, averaged, max_difference, root_rank, same() start outside
# this view — confirm against the full test file.
# Threshold for floating point equality depends on number of
# ranks, since we're comparing against precise multiplication.
if size <= 3 or dtype in ['int32', 'int64']:
threshold = 1
elif size < 10:
threshold = 1e-4
elif size < 15:
threshold = 5e-4
else:
# With 15+ ranks the accumulated floating-point error is presumably too
# large to bound — bail out of the enclosing (invisible) loop.
break
# Dump per-rank diagnostics before asserting so failures are debuggable.
if max_difference > threshold:
print("average", count, dtype, dim, max_difference, threshold)
print("tensor", hvd.rank(), tensor)
print("averaged", hvd.rank(), averaged)
assert max_difference <= threshold, 'hvd.allreduce produces \
incorrect results for average'
# Broadcast root_rank's tensor to all ranks; a per-iteration name keeps the
# collective ops distinct.
broadcast_tensor = hvd.broadcast(tensor, root_rank=root_rank,
name=str(count))
# On non-root ranks the local tensor must still differ from root_tensor:
# broadcast must not overwrite its input in place.
if rank != root_rank:
if same(tensor.asnumpy(), root_tensor.asnumpy()):
print("broadcast", count, dtype, dim,
mx.nd.max(tensor == root_tensor))
print("tensor", hvd.rank(), tensor)
print("root_tensor", hvd.rank(), root_tensor)
print("comparison", hvd.rank(), tensor == root_tensor)
assert not same(tensor.asnumpy(), root_tensor.asnumpy()), \
'hvd.broadcast modifies source tensor'
# The broadcast result must match the root's tensor exactly on every rank.
if not same(broadcast_tensor.asnumpy(), root_tensor.asnumpy()):
print("broadcast", count, dtype, dim)
print("broadcast_tensor", hvd.rank(), broadcast_tensor)
print("root_tensor", hvd.rank(), root_tensor)
print("comparison", hvd.rank(),
broadcast_tensor == root_tensor)
assert same(broadcast_tensor.asnumpy(), root_tensor.asnumpy()), \
'hvd.broadcast produces incorrect broadcasted tensor'
# NOTE(review): top-level setup fragment of a BERT SQuAD fine-tuning script;
# `log`, `formatter` and `args` are defined outside this view.
# Mirror logging to a per-run file in the output dir and to the console.
fh = logging.FileHandler(os.path.join(args.output_dir, 'finetune_squad.log'),
mode='w')
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(formatter)
log.addHandler(console)
log.addHandler(fh)
log.info(args)
# Distributed setup: query Horovod when requested, otherwise fall back to
# single-process defaults (rank 0, world size 1).
if args.comm_backend == 'horovod':
import horovod.mxnet as hvd
hvd.init()
rank = hvd.rank()
size = hvd.size()
local_rank = hvd.local_rank()
else:
rank = 0
size = 1
local_rank = 0
# Enable MXNet automatic mixed precision when training in float16.
if args.dtype == 'float16':
from mxnet.contrib import amp
amp.init()
# Unpack frequently used CLI options into locals.
model_name = args.bert_model
dataset_name = args.bert_dataset
only_predict = args.only_predict
model_parameters = args.model_parameters
pretrained_bert_parameters = args.pretrained_bert_parameters
def init_comm(backend, gpus):
    """Initialize the communication backend for distributed training.

    Parameters
    ----------
    backend : str
        'horovod' for Horovod-based GPU communication; any other value is
        passed to ``mx.kv.create`` as a KVStore type.
    gpus : str
        Comma-separated GPU ids used on the KVStore path; '-1' or '' runs
        on CPU. Ignored on the horovod path, which pins one GPU per
        local rank.

    Returns
    -------
    tuple
        ``(store, num_workers, rank, local_rank, is_master_node, ctx_l)``
        where ``store`` is None for horovod.
    """
    if backend == 'horovod':
        try:
            import horovod.mxnet as hvd  # pylint: disable=import-outside-toplevel
        except ImportError:
            logging.info('horovod must be installed.')
            sys.exit(1)
        hvd.init()
        store = None
        num_workers = hvd.size()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        is_master_node = rank == local_rank
        ctx_l = [mx.gpu(local_rank)]
        logging.info('GPU communication supported by horovod')
    else:
        store = mx.kv.create(backend)
        num_workers = store.num_workers
        rank = store.rank
        local_rank = 0
        is_master_node = rank == local_rank
        if gpus == '-1' or gpus == '':
            ctx_l = [mx.cpu()]
            # fixed typo in log message ('Runing' -> 'Running')
            logging.info('Running on CPU')
        else:
            ctx_l = [mx.gpu(int(x)) for x in gpus.split(',')]
            logging.info('GPU communication supported by KVStore')
    # BUG FIX: the original fell off the end and implicitly returned None,
    # discarding everything it computed; return the state like the sibling
    # init_comm(backend) overload does.
    return store, num_workers, rank, local_rank, is_master_node, ctx_l
def init_comm(backend):
    """Set up the distributed-communication backend.

    For ``backend == 'horovod'`` this initializes Horovod and pins one GPU
    per local rank; any other value is treated as an MXNet KVStore type,
    with the device list driven by the global ``args.gpus`` string (CPU
    when it is None or empty).

    Returns ``(store, num_workers, rank, local_rank, is_master_node, ctxs)``
    where ``store`` is None on the horovod path.
    """
    if backend == 'horovod':
        try:
            import horovod.mxnet as hvd  # pylint: disable=import-outside-toplevel
        except ImportError:
            logging.info('horovod must be installed.')
            sys.exit(1)
        hvd.init()
        workers = hvd.size()
        worker_rank = hvd.rank()
        gpu_rank = hvd.local_rank()
        return (None, workers, worker_rank, gpu_rank,
                worker_rank == gpu_rank, [mx.gpu(gpu_rank)])
    # KVStore path.
    store = mx.kv.create(backend)
    worker_rank = store.rank
    gpu_rank = 0
    if args.gpus is None or args.gpus == '':
        ctxs = [mx.cpu()]
    else:
        ctxs = [mx.gpu(int(g)) for g in args.gpus.split(',')]
    return (store, store.num_workers, worker_rank, gpu_rank,
            worker_rank == gpu_rank, ctxs)
# NOTE(review): fragment — this first line is the tail of a download(...)
# call whose opening is outside this view; presumably it fetches the MNIST
# zip into data_dir — confirm upstream.
dirname=data_dir)
with zipfile.ZipFile(zip_file_path) as zf:
zf.extractall(data_dir)
# MNIST images are 1 channel, 28x28 pixels.
input_shape = (1, 28, 28)
batch_size = args.batch_size
# Shard the training set across Horovod ranks so each worker reads a
# distinct partition.
train_iter = mx.io.MNISTIter(
image="%s/train-images-idx3-ubyte" % data_dir,
label="%s/train-labels-idx1-ubyte" % data_dir,
input_shape=input_shape,
batch_size=batch_size,
shuffle=True,
flat=False,
num_parts=hvd.size(),
part_index=hvd.rank()
)
# Validation set is NOT sharded here — presumably every rank evaluates the
# full test set; confirm this is intended.
val_iter = mx.io.MNISTIter(
image="%s/t10k-images-idx3-ubyte" % data_dir,
label="%s/t10k-labels-idx1-ubyte" % data_dir,
input_shape=input_shape,
batch_size=batch_size,
flat=False,
)
return train_iter, val_iter
# NOTE(review): fragment — these keyword lines continue an
# mx.io.MNISTIter(...) training-iterator call whose opening (and the
# enclosing function's def) lie outside this view.
input_shape=input_shape,
batch_size=batch_size,
shuffle=True,
flat=False,
num_parts=hvd.size(),
part_index=hvd.rank()
)
# Here the validation iterator IS sharded across ranks (num_parts /
# part_index), so each worker evaluates a distinct slice of the test set.
val_iter = mx.io.MNISTIter(
image="%s/t10k-images-idx3-ubyte" % data_dir,
label="%s/t10k-labels-idx1-ubyte" % data_dir,
input_shape=input_shape,
batch_size=batch_size,
flat=False,
num_parts=hvd.size(),
part_index=hvd.rank()
)
return train_iter, val_iter
# NOTE(review): interior of a MNIST data-loading helper; the enclosing def
# (and the origin of data_dir / args) are outside this view.
# Download and unpack the MNIST archive into data_dir.
zip_file_path = download("http://data.mxnet.io/mxnet/data/mnist.zip", dirname=data_dir)
with zipfile.ZipFile(zip_file_path) as zf:
zf.extractall(data_dir)
# MNIST images are 1 channel, 28x28 pixels.
input_shape = (1, 28, 28)
batch_size = args.batch_size
# Training data is partitioned across Horovod ranks.
train_iter = mx.io.MNISTIter(
image="%s/train-images-idx3-ubyte" % data_dir,
label="%s/train-labels-idx1-ubyte" % data_dir,
input_shape=input_shape,
batch_size=batch_size,
shuffle=True,
flat=False,
num_parts=hvd.size(),
part_index=hvd.rank(),
)
# Validation data is not partitioned — presumably each rank scores the full
# test set; confirm intended.
val_iter = mx.io.MNISTIter(
image="%s/t10k-images-idx3-ubyte" % data_dir,
label="%s/t10k-labels-idx1-ubyte" % data_dir,
input_shape=input_shape,
batch_size=batch_size,
flat=False,
)
return train_iter, val_iter
# NOTE(review): fragment — this first line is the tail of a download(...)
# call whose opening is outside this view; presumably it fetches the MNIST
# zip into data_dir — confirm upstream.
dirname=data_dir)
with zipfile.ZipFile(zip_file_path) as zf:
zf.extractall(data_dir)
# MNIST images are 1 channel, 28x28 pixels.
input_shape = (1, 28, 28)
batch_size = args.batch_size
# Training data is partitioned across Horovod ranks.
train_iter = mx.io.MNISTIter(
image="%s/train-images-idx3-ubyte" % data_dir,
label="%s/train-labels-idx1-ubyte" % data_dir,
input_shape=input_shape,
batch_size=batch_size,
shuffle=True,
flat=False,
num_parts=hvd.size(),
part_index=hvd.rank()
)
# In this variant the validation iterator is also sharded, so each rank
# evaluates only its own slice of the test set.
val_iter = mx.io.MNISTIter(
image="%s/t10k-images-idx3-ubyte" % data_dir,
label="%s/t10k-labels-idx1-ubyte" % data_dir,
input_shape=input_shape,
batch_size=batch_size,
flat=False,
num_parts=hvd.size(),
part_index=hvd.rank()
)
return train_iter, val_iter