print("Number of minibatches: {}".format(len(train_dataloader)))
if not os.path.isdir(args.exp_dir):
os.makedirs(args.exp_dir)
# create model
model_config = config["model_config"]
model = transformer.TransformerAM(model_config["feat_dim"], args.dim_model, args.nheads, args.ff_size, args.nlayers, args.dropout, model_config["label_size"])
model.cuda()
# setup the optimizer
optimizer = th.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
# Broadcast parameters and optimizer state from rank 0 to all other processes.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# Add Horovod Distributed Optimizer
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
if os.path.isfile(args.seed_model):
checkpoint = th.load(args.seed_model)
state_dict = checkpoint['model']
model.load_state_dict(state_dict)
print("=> loaded checkpoint '{}' ".format(args.seed_model))
else:
sys.stderr.write('ERROR: The model file %s does not exist!\n'%(args.seed_model))
sys.exit(1)
HCLG = args.den_dir + "/HCLG.fst"
words_txt = args.den_dir + "/words.txt"
silence_phones = args.den_dir + "/phones/silence.csl"
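# These training snippets assume the Horovod runtime was already initialized near the
# top of the script. A minimal sketch of that bootstrap (the learning-rate scaling by
# hvd.size() is a common convention, not something shown in the snippet above):
import torch as th
import horovod.torch as hvd

hvd.init()                                # start Horovod and discover peer processes
if th.cuda.is_available():
    th.cuda.set_device(hvd.local_rank())  # pin each worker process to a single GPU
lr = args.lr * hvd.size()                 # optional: scale the base LR by the worker count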
if not os.path.isdir(args.exp_dir):
os.makedirs(args.exp_dir)
# create model
model_config = config["model_config"]
model = lstm.LSTMAM(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)
model.cuda()
# setup the optimizer
optimizer = th.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
# Broadcast parameters and optimizer state from rank 0 to all other processes.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# Add Horovod Distributed Optimizer
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
if os.path.isfile(args.seed_model):
checkpoint = th.load(args.seed_model)
state_dict = checkpoint['model']
model.load_state_dict(state_dict)
print("=> loaded checkpoint '{}' ".format(args.seed_model))
else:
sys.stderr.write('ERROR: The model file %s does not exist!\n'%(args.seed_model))
sys.exit(1)
HCLG = args.den_dir + "/HCLG.fst"
words_txt = args.den_dir + "/words.txt"
silence_phones = args.den_dir + "/phones/silence.csl"
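# Each Horovod worker only sees its own shard of the data, so per-worker losses can
# differ. A small generic sketch (not code from this repo) of averaging a scalar
# metric across all workers before logging it:
import torch as th
import horovod.torch as hvd

def average_across_workers(value, name):
    # hvd.allreduce averages by default, so every rank receives the same number
    return hvd.allreduce(th.tensor(value), name=name).item()

# e.g. avg_loss = average_across_workers(epoch_loss, "epoch_loss")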
if not os.path.isdir(args.exp_dir):
os.makedirs(args.exp_dir)
# create model
model_config = config["model_config"]
model = lstm.LSTMAM(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)
model.cuda()
# setup the optimizer
optimizer = th.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
# Broadcast parameters and optimizer state from rank 0 to all other processes.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# Add Horovod Distributed Optimizer
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
if os.path.isfile(args.seed_model):
checkpoint = th.load(args.seed_model)
state_dict = checkpoint['model']
model.load_state_dict(state_dict)
print("=> loaded checkpoint '{}' ".format(args.seed_model))
else:
sys.stderr.write('ERROR: The model file %s does not exist!\n'%(args.seed_model))
sys.exit(1)
HCLG = args.den_dir + "/HCLG.fst"
words_txt = args.den_dir + "/words.txt"
silence_phones = args.den_dir + "/phones/silence.csl"
if not os.path.isdir(args.exp_dir):
os.makedirs(args.exp_dir)
# create model
model_config = config["model_config"]
model = lstm.LSTMAM(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)
model.cuda()
# setup the optimizer
optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
# Broadcast parameters and optimizer state from rank 0 to all other processes.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# Add Horovod Distributed Optimizer
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
if os.path.isfile(args.seed_model):
checkpoint = th.load(args.seed_model)
state_dict = checkpoint['model']
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    header = k[:7]
    name = k[7:]  # strip the 'module.' prefix that DataParallel adds to every key
    new_state_dict[name] = v
if header == "module.":
    model.load_state_dict(new_state_dict)
else:
    model.load_state_dict(state_dict)
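# The "module." prefix handled above appears whenever the checkpoint was saved from a
# model wrapped in nn.DataParallel or DistributedDataParallel. A quick illustration of
# that generic PyTorch behaviour (not code from this repo):
import torch as th

wrapped = th.nn.DataParallel(th.nn.Linear(4, 2))
print(list(wrapped.state_dict().keys()))  # ['module.weight', 'module.bias']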
""" Setup for Horovod usage.
Args:
model(MultitaskModel): The MultitaskModel object.
learning_rate(float): Learning rate for the model.
Returns: hvd.DistributedOptimizer: Optimizer to use for computing
gradients and applying updates.
"""
# Horovod: scale learning rate by the number of GPUs.
optimizer = optim.Adam(model.parameters(), lr=learning_rate * hvd.size())
# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16
# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(
optimizer,
named_parameters=model.named_parameters(),
compression=compression,
)
return optimizer
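# The helper above (its enclosing definition is cut off in this snippet) returns a
# Horovod DistributedOptimizer with fp16 gradient compression. A sketch of how such a
# helper is typically used; `setup_horovod`, `data_loader`, and `loss_fn` are
# placeholder names, not identifiers from this repo:
optimizer = setup_horovod(model, learning_rate=1e-4)
for batch in data_loader:
    optimizer.zero_grad()
    loss = loss_fn(model(batch))
    loss.backward()   # gradients are allreduced (fp16-compressed) as backward runs
    optimizer.step()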
p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)
],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)
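# The parameter-group list above is truncated; the usual pattern it comes from looks
# roughly like this (a sketch of the common transformer fine-tuning convention; the
# no_decay names and the 0.01 weight decay are typical values, not necessarily this
# repo's exact settings):
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {
        "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]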
if distributed:
optimizer = hvd.DistributedOptimizer(
optimizer,
named_parameters=self.model.named_parameters(),
backward_passes_per_step=gradient_accumulation_steps,
)
hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
if warmup_proportion:
warmup_steps = t_total * warmup_proportion
else:
warmup_steps = 0
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)
global_step = 0
tr_loss = 0.0
# self.model.zero_grad()
train_iterator = trange(int(num_epochs), desc="Epoch")
for _ in train_iterator:
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", mininterval=60)):
self.model.train()
batch = tuple(t.to(device) for t in batch)
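# The loop above is cut off right before the parameter update. A sketch of how the
# rest of such a step commonly looks with the Horovod-wrapped optimizer and the warmup
# scheduler defined earlier (the batch unpacking and `compute_loss` are assumptions,
# not this repo's code):
inputs, labels = batch[0], batch[1]
loss = compute_loss(self.model(inputs), labels)   # placeholder for the task-specific loss
loss = loss / gradient_accumulation_steps
loss.backward()
if (step + 1) % gradient_accumulation_steps == 0:
    optimizer.step()     # Horovod allreduces once per accumulation window
    scheduler.step()
    self.model.zero_grad()
    global_step += 1
tr_loss += loss.item()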
# create model
model_config = config["model_config"]
model = lstm.LSTMStack(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)
# Start training
th.backends.cudnn.enabled = True
if th.cuda.is_available():
model.cuda()
# optimizer
optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)
if args.hvd:
# Broadcast parameters and optimizer state from rank 0 to all other processes.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# Add Horovod Distributed Optimizer
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
# criterion
criterion = nn.CrossEntropyLoss(ignore_index=-100)
start_epoch = 0
if args.resume_from_model:
assert os.path.isfile(args.resume_from_model), "ERROR: model file {} does not exist!".format(args.resume_from_model)
checkpoint = th.load(args.resume_from_model)
state_dict = checkpoint['model']
start_epoch = checkpoint['epoch']
model.load_state_dict(state_dict)
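# The resume logic above expects a checkpoint dict with 'model' and 'epoch' keys.
# A sketch of a matching save call at the end of each epoch (the file name pattern is
# an assumption, not taken from this repo):
th.save({'model': model.state_dict(), 'epoch': epoch + 1},
        os.path.join(args.exp_dir, "model.epoch{}.tar".format(epoch)))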
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_set, num_replicas=hvd.size(), rank=hvd.rank())
test_sampler = torch.utils.data.distributed.DistributedSampler(
test_set, num_replicas=hvd.size(), rank=hvd.rank())
# setup mini-batch enumerator for both train-set and test-set
train_loader = torch.utils.data.DataLoader(
dataset=train_set, sampler=train_sampler if distributed else None,
batch_size=batch_size, shuffle=False if distributed else True, drop_last=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
dataset=test_set, sampler=test_sampler if distributed else None,
batch_size=batch_size, shuffle=False if distributed else True, drop_last=True, **kwargs)
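# With a DistributedSampler, shuffling only changes between epochs if the sampler is
# told which epoch it is. A one-line sketch of the standard PyTorch idiom that belongs
# at the top of every training epoch in the distributed case:
if distributed:
    train_sampler.set_epoch(epoch)   # reseeds the shuffle so shards differ each epoch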
# Distributed: broadcast parameters to all the processes
if distributed:
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
print("Data Dependent Initialization") if root_process else print("Data Dependent Initialization with ya!")
# data-dependent initialization
warmup(model, device, train_loader, 25, root_process)
# if distributed over multiple GPU's, set-up a barrier ensuring that all the processes have initialized the models
if distributed:
hvd.allreduce_(torch.Tensor(0), name='barrier')
# setup exponential moving average decay (EMA) for the parameters.
# This basically means maintaining two sets of parameters during training/testing:
# 1. parameters that are the result of EMA
# 2. parameters not affected by EMA
# The first set (the EMA parameters) is only used at test time.
ema = modules.EMA(0.999)
with torch.no_grad():
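# The body of this no_grad block is cut off in the snippet. As a generic illustration
# of the EMA idea described in the comments above (not the API of this repo's
# modules.EMA class), a shadow copy of the parameters is kept and nudged toward the
# live values after every optimizer step:
decay = 0.999
shadow = {n: p.detach().clone() for n, p in model.named_parameters()}  # built once
for n, p in model.named_parameters():                                  # run after each step
    shadow[n].mul_(decay).add_(p.detach(), alpha=1.0 - decay)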
""" Setup for Horovod usage.
Args:
model(MultitaskModel): The MultitaskModel object.
learning_rate(float): Learning rate for the model.
Returns: hvd.DistributedOptimizer: Optimizer to use for computing
gradients and applying updates.
"""
# Horovod: scale learning rate by the number of GPUs.
optimizer = optim.Adam(model.parameters(), lr=learning_rate * hvd.size())
# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16
# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(
optimizer,
named_parameters=model.named_parameters(),
compression=compression,
)
return optimizer
p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)
],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)
if distributed:
optimizer = hvd.DistributedOptimizer(
optimizer,
named_parameters=self.model.named_parameters(),
backward_passes_per_step=gradient_accumulation_steps,
)
hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
if warmup_proportion:
warmup_steps = t_total * warmup_proportion
else:
warmup_steps = 0
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)
global_step = 0
tr_loss = 0.0
# self.model.zero_grad()
self.model.train()
train_iterator = trange(int(num_epochs), desc="Epoch")
for _ in train_iterator:
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", mininterval=60)):
batch = tuple(t.to(device) for t in batch)