parser.add_argument('--momentum', type=float, default=0.9,
                    help='SGD momentum')
parser.add_argument('--wd', type=float, default=0.00005,
                    help='weight decay')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=42,
                    help='random seed')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
pushpull_batch_size = args.batch_size * args.batches_per_pushpull
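# With gradient accumulation, each BytePS push-pull (gradient sync) covers
# batch_size * batches_per_pushpull samples per worker: e.g. a batch size of
# 32 with 4 batches per push-pull accumulates 128 samples between updates.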
bps.init()
torch.manual_seed(args.seed)
if args.cuda:
    # BytePS: pin GPU to local rank.
    torch.cuda.set_device(bps.local_rank())
    torch.cuda.manual_seed(args.seed)

cudnn.benchmark = True
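# cudnn.benchmark above lets cuDNN auto-tune convolution kernels; this pays
# off when input shapes stay fixed across iterations, as in this script.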
# If set > 0, will resume training from a given checkpoint.
resume_from_epoch = 0
for try_epoch in range(args.epochs, 0, -1):
    if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
        resume_from_epoch = try_epoch
        break
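# The loop scans backwards from the final epoch, so resume_from_epoch lands on
# the newest checkpoint on disk. checkpoint_format is expected to be a format
# string with an {epoch} placeholder, e.g. 'checkpoint-{epoch}.pth.tar'.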
opt.src_vocab_size = training_data.dataset.src_vocab_size
opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size
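# Note: the dataset-derived vocabulary sizes above are overridden by the
# hardcoded values below.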
opt.src_vocab_size = 46930
opt.tgt_vocab_size = 23094
#========= Preparing Model =========#
if opt.embs_share_weight:
    assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
        'The src/tgt word2idx tables are different but were asked to share word embeddings.'
print(opt)
use_horovod = int(os.environ.get("USE_HOROVOD", "0"))
if use_horovod > 0:
    # BytePS exposes a Horovod-compatible API, hence the hvd alias.
    import byteps.torch as hvd
    hvd.init()
    device = torch.device('cuda', hvd.local_rank())
else:
    device = torch.device('cuda' if opt.cuda else 'cpu')
transformer = Transformer(
    opt.src_vocab_size,
    opt.tgt_vocab_size,
    opt.max_token_seq_len,
    tgt_emb_prj_weight_sharing=opt.proj_share_weight,
    emb_src_tgt_weight_sharing=opt.embs_share_weight,
    d_k=opt.d_k,
    d_v=opt.d_v,
    d_model=opt.d_model,
    d_word_vec=opt.d_word_vec,
    d_inner=opt.d_inner_hid,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--fp16',
                    action='store_true',
                    help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
                    type=float, default=0,
                    help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                         "0 (default value): dynamic loss scaling.\n"
                         "Positive power of 2: static loss scaling value.\n")
parser.add_argument('--server_ip', type=str, default='', help="Can be used for remote debugging.")
parser.add_argument('--server_port', type=str, default='', help="Can be used for remote debugging.")
args = parser.parse_args()
use_horovod = int(os.getenv("USE_HOROVOD", "0"))  # default to "0" so an unset variable doesn't crash int()
print("env variable USE_HOROVOD:", use_horovod)
if use_horovod == 1:
    hvd.init()
    args.local_rank = hvd.local_rank()
    print("use horovod, local rank:", args.local_rank)
    # Give each worker its own output directory.
    args.output_dir = args.output_dir + "_" + str(args.local_rank)
if args.server_ip and args.server_port:
    # Remote debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
    import ptvsd
    print("Waiting for debugger attach")
    ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
    ptvsd.wait_for_attach()
processors = {
    "cola": ColaProcessor,
    "mnli": MnliProcessor,
    "mrpc": MrpcProcessor,
    "sst-2": Sst2Processor,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--num-warmup', type=int, default=10,
                    help='number of warm-up steps that don\'t count towards benchmark')
parser.add_argument('--num-iters', type=int, default=1000,
                    help='number of benchmark iterations')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA')
# Note: argparse's type=bool treats any non-empty string as True, so
# '--no-wait False' still yields True; the flag is effectively always on
# unless passed an empty string.
parser.add_argument('--no-wait', type=bool, default=True,
                    help='wait for other worker request first')
parser.add_argument('--gpu', type=int, default=-1,
                    help='use a specified gpu')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
bps.init()
# BytePS: pin GPU to local rank.
if args.gpu >= 0:
    torch.cuda.set_device(args.gpu)
else:
    torch.cuda.set_device(bps.local_rank())
cudnn.benchmark = True
def log(s, nl=True):
    # Print only on the root rank so each message appears once per job.
    if bps.rank() != 0:
        return
    print(s, end='\n' if nl else '')
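# Example: log('Running benchmark...') prints once, even with many workers.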
                    help='number of benchmark iterations')
parser.add_argument('--num-classes', type=int, default=1000,
                    help='number of classes')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--profiler', action='store_true', default=False,
                    help='enables the profiler')
parser.add_argument('--partition', type=int, default=None,
                    help='partition size')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
bps.init()
if args.cuda:
    # BytePS: pin GPU to local rank.
    torch.cuda.set_device(bps.local_rank())

cudnn.benchmark = True
# Set up standard model.
model = getattr(models, args.model)(num_classes=args.num_classes)
if args.cuda:
    # Move model to GPU.
    model.cuda()
optimizer = optim.SGD(model.parameters(), lr=0.01)
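# A typical next step (a sketch, assuming byteps.torch is imported as bps, as
# the bps.init() call above implies): wrap the optimizer with BytePS's
# Horovod-style DistributedOptimizer so gradients are push-pulled across
# workers, and broadcast the initial state from rank 0 so every worker starts
# from identical weights.
optimizer = bps.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters())
bps.broadcast_parameters(model.state_dict(), root_rank=0)
bps.broadcast_optimizer_state(optimizer, root_rank=0)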
                    help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=42, metavar='S',
                    help='random seed (default: 42)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
parser.add_argument('--fp16-pushpull', action='store_true', default=False,
                    help='use fp16 compression during pushpull')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
# BytePS: initialize library.
bps.init()
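# bps.init() must run before any rank/size queries below. The --fp16-pushpull
# flag is typically consumed later when building the distributed optimizer,
# e.g. (a sketch, mirroring the BytePS compression API):
# compression = bps.Compression.fp16 if args.fp16_pushpull else bps.Compression.none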
torch.manual_seed(args.seed)
if args.cuda:
    # BytePS: pin GPU to local rank.
    torch.cuda.set_device(bps.local_rank())
    torch.cuda.manual_seed(args.seed)
kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_dataset = \
    datasets.MNIST('data-%d' % bps.rank(), train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
# BytePS: use DistributedSampler to partition the training data.
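# A sketch of the usual continuation: each worker draws a disjoint shard sized
# by bps.size() and indexed by bps.rank(), so no two workers train on the same
# examples within an epoch.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=bps.size(), rank=bps.rank())
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)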