print("Number of minibatches: {}".format(len(train_dataloader)))
if not os.path.isdir(args.exp_dir):
    os.makedirs(args.exp_dir)
# create model
model_config = config["model_config"]
model = lstm.LSTMAM(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)
model.cuda()
# setup the optimizer
optimizer = th.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
# Broadcast parameters and optimizer state from rank 0 to all other processes.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# Add Horovod Distributed Optimizer
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
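# -- The snippet above assumes Horovod has already been initialised and each
# -- process pinned to one GPU; a minimal sketch of that setup (standard
# -- Horovod calls, but their exact placement in the original script is assumed).
import horovod.torch as hvd
import torch as th

hvd.init()                                  # start Horovod and discover ranks
if th.cuda.is_available():
    th.cuda.set_device(hvd.local_rank())    # pin this process to its local GPU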
if os.path.isfile(args.seed_model):
    checkpoint = th.load(args.seed_model)
    state_dict = checkpoint['model']
    model.load_state_dict(state_dict)
    print("=> loaded checkpoint '{}'".format(args.seed_model))
else:
    sys.stderr.write('ERROR: The model file %s does not exist!\n' % args.seed_model)
    sys.exit(1)
HCLG = args.den_dir + "/HCLG.fst"
words_txt = args.den_dir + "/words.txt"
# By default, Adasum doesn't need scaling up learning rate.
lr_scaler = hvd.size() if not args.use_adasum else 1
if args.cuda:
    # Move model to GPU.
    model.cuda()
    # If using GPU Adasum allreduce, scale learning rate by local_size.
    if args.use_adasum and hvd.nccl_built():
        lr_scaler = hvd.local_size()
# Horovod: scale learning rate by lr_scaler.
optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler,
                      momentum=args.momentum)
# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters(),
                                     compression=compression,
                                     op=hvd.Adasum if args.use_adasum else hvd.Average)
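# -- A hedged sketch of the argparse flags the Adasum snippet above reads; only
# -- the flag names are taken from the code, the defaults here are assumptions.
import argparse

parser = argparse.ArgumentParser(description='Horovod training (sketch)')
parser.add_argument('--lr', type=float, default=0.01)
parser.add_argument('--momentum', type=float, default=0.9)
parser.add_argument('--cuda', action='store_true')
parser.add_argument('--use-adasum', action='store_true',
                    help='use hvd.Adasum for allreduce instead of averaging')
parser.add_argument('--fp16-allreduce', action='store_true',
                    help='compress gradients with fp16 during allreduce')
args = parser.parse_args()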
def train(epoch):
    model.train()
    # Horovod: set epoch to sampler for shuffling.
    train_sampler.set_epoch(epoch)
# Restore from a previous checkpoint, if initial_epoch is specified.
# Horovod: restore on the first worker which will broadcast weights to other workers.
if resume_from_epoch > 0 and hvd.rank() == 0:
    filepath = args.checkpoint_format.format(epoch=resume_from_epoch)
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
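# -- A sketch of how resume_from_epoch could be determined before train() runs,
# -- following the pattern in the stock Horovod examples: rank 0 probes for the
# -- newest checkpoint file and the result is broadcast so every worker agrees.
# -- args.epochs is an assumption; args.checkpoint_format comes from the code above.
resume_from_epoch = 0
for try_epoch in range(args.epochs, 0, -1):
    if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
        resume_from_epoch = try_epoch
        break
# all ranks must agree on the value, so broadcast it from rank 0
resume_from_epoch = hvd.broadcast(torch.tensor(resume_from_epoch), root_rank=0,
                                  name='resume_from_epoch').item()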
if args.label_smoothing:
    criterion = cross_encropy_with_label_smoothing
else:
    criterion = nn.CrossEntropyLoss()
# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
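# -- Metric is not defined in this excerpt; a plausible sketch in the style of
# -- the Horovod examples, averaging values across workers with hvd.allreduce.
# -- Treat the exact fields as assumptions about the original helper.
class Metric(object):
    def __init__(self, name):
        self.name = name
        self.sum = torch.tensor(0.)
        self.n = torch.tensor(0.)

    def update(self, val):
        # hvd.allreduce averages the value over all ranks by default
        self.sum += hvd.allreduce(val.detach().cpu(), name=self.name)
        self.n += 1

    @property
    def avg(self):
        return self.sum / self.n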
def train(epoch):
    model.train()
    train_sampler.set_epoch(epoch)
    train_loss = Metric('train_loss')
    train_accuracy = Metric('train_accuracy')
    with tqdm(total=len(train_loader),
              desc='Train Epoch #{}'.format(epoch + 1),
              disable=not verbose) as t:
        for batch_idx, (data, target) in enumerate(train_loader):
            lr_cur = adjust_learning_rate(epoch, batch_idx, type=args.lr_scheduler)
            if args.cuda:
optimizer, is_dynamic_opt = get_torch_optimizer(optimizer, model, num_steps_per_epoch, param_groups)
start_epoch = 0
latest_path = latest_checkpoint + '.pyt' if latest_checkpoint else os.path.join(FLAGS.model_dir, 'latest.pyt')
if not os.path.exists(latest_path):
    latest_path = os.path.join(FLAGS.model_dir, 'latest.pyt')
if os.path.exists(latest_path):
    # NOTICE: only rank 0 loads the checkpoint; if every rank loaded it at once they would OOM...
    if not use_horovod or hvd.rank() == 0:
        logging.info('loading torch model from %s', latest_path)
        checkpoint = load_torch_model(model, latest_path)
        if FLAGS.torch_load_optimizer:
            optimizer.load_state_dict(checkpoint['optimizer'])
    if use_horovod:
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        optimizer_ = optimizer if not hasattr(optimizer, 'optimizer') else optimizer.optimizer
        hvd.broadcast_optimizer_state(optimizer_, root_rank=0)
    if not FLAGS.torch_finetune:
        if not use_horovod or hvd.rank() == 0:
            start_epoch = checkpoint['epoch']
            step = checkpoint['step']
        else:
            start_epoch = 0
            step = 0
        if use_horovod:
            temp = np.array([start_epoch, step])
            comm.Bcast(temp, root=0)
            start_epoch, step = temp[0], temp[1]
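# -- comm is not created in this excerpt; a minimal sketch of the mpi4py handle
# -- that comm.Bcast above presumably refers to (an assumption, since only the
# -- Bcast call appears in the original code).
from mpi4py import MPI

comm = MPI.COMM_WORLD   # comm.Bcast(buf, root=0) broadcasts the numpy buffer in place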
# create model
model_config = config["model_config"]
model = lstm.LSTMStack(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)
# Start training
th.backends.cudnn.enabled = True
if th.cuda.is_available():
    model.cuda()
# optimizer
optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
if args.hvd:
    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
# criterion
criterion = nn.CrossEntropyLoss(ignore_index=-100)
start_epoch = 0
if args.resume_from_model:
    assert os.path.isfile(args.resume_from_model), "ERROR: model file {} does not exist!".format(args.resume_from_model)
    checkpoint = th.load(args.resume_from_model)
    state_dict = checkpoint['model']
    start_epoch = checkpoint['epoch']
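# -- The counterpart of the resume logic above: a hedged sketch of saving a
# -- checkpoint on rank 0 only, so that workers never race on the same file.
# -- The output path and the `epoch` variable are illustrative assumptions;
# -- the dictionary keys mirror the ones read back by the resume code.
if not args.hvd or hvd.rank() == 0:
    th.save({'model': model.state_dict(),
             'optimizer': optimizer.state_dict(),
             'epoch': epoch},
            os.path.join(args.exp_dir, 'model.epoch-{}.tar'.format(epoch)))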
net = model.Network(**model_kwargs)
# Model CPU -> GPU
if use_cuda:
    torch.cuda.set_device(hvd.local_rank())
    net = net.cuda()
# Creating a Socket for a network
socket = model.Socket(net)
# Preparing Socket for Horovod training
for opt_index in range(len(socket.optimizers)):
    socket.optimizers[opt_index].optimizer = hvd.DistributedOptimizer(
        socket.optimizers[opt_index].optimizer)
hvd.broadcast_parameters(socket.model.state_dict(), root_rank=0)
# Creating the datasets
ds_index = dataset.DataSetIndex(**dataset_kwargs)
train_dataset = dataset.DataSet(ds_index, mode='train')
valid_dataset = dataset.DataSet(ds_index, mode='valid')
test_dataset = dataset.DataSet(ds_index, mode='test')
# Preparing the datasets for Horovod
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
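# -- A sketch of how the sampler is typically consumed: one DistributedSampler
# -- per split, then DataLoaders that shard each epoch across Horovod workers.
# -- The batch size and worker counts here are illustrative assumptions.
valid_sampler = torch.utils.data.distributed.DistributedSampler(
    valid_dataset, num_replicas=hvd.size(), rank=hvd.rank())

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=32, sampler=train_sampler,
    num_workers=4, pin_memory=True)
valid_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=32, sampler=valid_sampler,
    num_workers=4, pin_memory=True)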
# create model
if args.pretrained:
    print("=> using pre-trained model '{}'".format(args.arch))
    model = models.__dict__[args.arch](pretrained=True)
else:
    print("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()
model.cuda()
# When using a single GPU per process and per DistributedDataParallel replica,
# we need to divide the batch size ourselves based on the total number of GPUs we have.
args.batch_size = int(args.batch_size / ngpus_per_node)
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
compression = hvd.Compression.fp16
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(), compression=compression)
cudnn.benchmark = True
# Data loading code
traindir = os.path.join(args.data, 'train')
valdir = os.path.join(args.data, 'val')
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
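# -- A sketch of the data pipeline the normalize transform usually feeds in this
# -- kind of script, following the torchvision ImageNet recipe; the augmentation
# -- choices, args.workers, and the Horovod sampler wiring are assumptions, not
# -- taken from the source.
train_dataset = datasets.ImageFolder(
    traindir,
    transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]))
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=args.batch_size, sampler=train_sampler,
    num_workers=args.workers, pin_memory=True)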
"params": [
p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)
],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)
if distributed:
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=self.model.named_parameters(),
        backward_passes_per_step=gradient_accumulation_steps,
    )
    hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
if warmup_proportion:
    warmup_steps = t_total * warmup_proportion
else:
    warmup_steps = 0
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)
global_step = 0
tr_loss = 0.0
# self.model.zero_grad()
self.model.train()
train_iterator = trange(int(num_epochs), desc="Epoch")
for _ in train_iterator:
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", mininterval=60)):
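        # -- A hedged sketch of what the truncated loop body typically does with
        # -- backward_passes_per_step == gradient_accumulation_steps: scale the
        # -- loss, then step optimizer and scheduler only every few iterations.
        # -- The batch unpacking and device handling are assumptions.
        batch = tuple(t.cuda() for t in batch)
        outputs = self.model(*batch)                    # assume the model returns the loss first
        loss = outputs[0] / gradient_accumulation_steps
        loss.backward()
        tr_loss += loss.item()
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            self.model.zero_grad()
            global_step += 1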
kwargs = {'num_workers': 6, 'pin_memory': True}
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=_BATCHSIZE, sampler=train_sampler, **kwargs)
# Autotune
cudnn.benchmark = True
logger.info("Loading model")
# Load symbol
model = models.__dict__['resnet50'](pretrained=False)
model.cuda()
if _DISTRIBUTED:
    # Horovod: broadcast parameters.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
num_gpus = hvd.size() if _DISTRIBUTED else 1
# Horovod: scale learning rate by the number of GPUs.
optimizer = optim.SGD(model.parameters(), lr=_LR * num_gpus,
                      momentum=0.9)
if _DISTRIBUTED:
    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
criterion = F.cross_entropy
# Main training loop
logger.info("Training ...")
for epoch in range(_EPOCHS):
    with Timer(output=logger.info, prefix="Training") as t:
        model.train()
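        # -- A hedged sketch of the per-batch work the truncated epoch loop would
        # -- do with criterion = F.cross_entropy; train_loader and the device
        # -- placement are assumptions, not taken from the original source.
        for data, target in train_loader:
            data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
            optimizer.zero_grad()
            loss = criterion(model(data), target)
            loss.backward()
            optimizer.step()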
print("Number of minibatches: {}".format(len(train_dataloader)))
if not os.path.isdir(args.exp_dir):
    os.makedirs(args.exp_dir)
# create model
model_config = config["model_config"]
model = lstm.LSTMAM(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)
model.cuda()
# setup the optimizer
optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
# Broadcast parameters and optimizer state from rank 0 to all other processes.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# Add Horovod Distributed Optimizer
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
if os.path.isfile(args.seed_model):
    checkpoint = th.load(args.seed_model)
    state_dict = checkpoint['model']
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        header = k[:7]
        name = k[7:]  # remove the 'module.' prefix added by DataParallel
        new_state_dict[name] = v
    if header == "module.":
        model.load_state_dict(new_state_dict)
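# -- The loop above strips the 'module.' prefix that nn.DataParallel adds to
# -- parameter names; the same idea as a small reusable helper (the function
# -- name is hypothetical, not part of the original source).
def strip_dataparallel_prefix(state_dict):
    from collections import OrderedDict
    cleaned = OrderedDict()
    for k, v in state_dict.items():
        cleaned[k[7:] if k.startswith('module.') else k] = v
    return cleaned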