How to use the horovod.torch.broadcast_optimizer_state function in horovod

To help you get started, we’ve selected a few horovod examples that show how horovod.torch.broadcast_optimizer_state is commonly used in public projects.

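Before diving into the project examples, here is a minimal sketch of the pattern these snippets follow: initialize Horovod, pin each worker to its GPU, build the model and optimizer, broadcast the initial state from rank 0, and wrap the optimizer. The tiny linear model and the hyperparameter values below are placeholders, not taken from any of the projects shown.

    import torch
    import horovod.torch as hvd

    hvd.init()                               # one process per GPU
    torch.cuda.set_device(hvd.local_rank())  # pin this process to its local GPU

    model = torch.nn.Linear(128, 10).cuda()  # placeholder model
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    # Make every worker start from rank 0's weights and optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Average gradients across workers during backward()/step().
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())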

github jzlianglu / pykaldi2 / bin / train_transformer_se.py
    print("Number of minibatches: {}".format(len(train_dataloader)))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    model = transformer.TransformerAM(model_config["feat_dim"], args.dim_model, args.nheads, args.ff_size, args.nlayers, args.dropout, model_config["label_size"])
    model.cuda()

    # setup the optimizer
    optimizer = th.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)                                            
        state_dict = checkpoint['model']                                            
        model.load_state_dict(state_dict)                                           
        print("=> loaded checkpoint '{}' ".format(args.seed_model))                      
    else:
        sys.stderr.write('ERROR: The model file %s does not exist!\n'%(args.seed_model))
        sys.exit(0)      

    HCLG = args.den_dir + "/HCLG.fst"
    words_txt = args.den_dir + "/words.txt"
    silence_phones = args.den_dir + "/phones/silence.csl"

github jzlianglu / pykaldi2 / bin / train_se2.py
    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    model = lstm.LSTMAM(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)

    model.cuda()

    # setup the optimizer
    optimizer = th.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)                                            
        state_dict = checkpoint['model']                                            
        model.load_state_dict(state_dict)                                           
        print("=> loaded checkpoint '{}' ".format(args.seed_model))                      
    else:
        sys.stderr.write('ERROR: The model file %s does not exist!\n'%(args.seed_model))
        sys.exit(0)      

    HCLG = args.den_dir + "/HCLG.fst"
    words_txt = args.den_dir + "/words.txt"
    silence_phones = args.den_dir + "/phones/silence.csl"

github jzlianglu / pykaldi2 / bin / train_se.py
    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    model = lstm.LSTMAM(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)

    model.cuda()

    # setup the optimizer
    optimizer = th.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)                                            
        state_dict = checkpoint['model']                                            
        model.load_state_dict(state_dict)                                           
        print("=> loaded checkpoint '{}' ".format(args.seed_model))                      
    else:
        sys.stderr.write('ERROR: The model file %s does not exist!\n'%(args.seed_model))
        sys.exit(0)      

    HCLG = args.den_dir + "/HCLG.fst"
    words_txt = args.den_dir + "/words.txt"
    silence_phones = args.den_dir + "/phones/silence.csl"

github jzlianglu / pykaldi2 / bin / train_chain.py
    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    model = lstm.LSTMAM(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)

    model.cuda()

    # setup the optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)                                            
        state_dict = checkpoint['model']
        from collections import OrderedDict                  
        new_state_dict = OrderedDict()                       
        for k, v in state_dict.items():                      
            header = k[:7]                                   
            name = k[7:] # remove 'module.' of dataparallel  
            new_state_dict[name]=v                           
        if header == "module.":                              
            model.load_state_dict(new_state_dict)            
        else:

github microsoft / nlp-recipes / scenarios / sentence_similarity / gensen_train.py
    """ Setup for Horovod usage.

    Args:
        model(MultitaskModel): The MultitaskModel object.
        learning_rate(float): Learning rate for the model.

    Returns: hvd.DistributedOptimizer: Optimizer to use for computing
    gradients and applying updates.

    """
    # Horovod: scale learning rate by the number of GPUs.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate * hvd.size())

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
    )

    return optimizer
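
The gensen example above passes compression=hvd.Compression.fp16 so that gradients are cast to float16 for the allreduce. A common variant is to gate this behind a command-line flag; the fp16_allreduce name below is illustrative, not taken from the example.

    # Hypothetical flag; hvd.Compression.none (the default) disables compression.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
    )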

github microsoft / nlp-recipes / utils_nlp / models / bert / question_answering_distributed_v2.py
                    p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)

        if distributed:
            optimizer = hvd.DistributedOptimizer(
                optimizer,
                named_parameters=self.model.named_parameters(),
                backward_passes_per_step=gradient_accumulation_steps,
            )

            hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        if warmup_proportion:
            warmup_steps = t_total * warmup_proportion
        else:
            warmup_steps = 0

        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

        global_step = 0
        tr_loss = 0.0
        # self.model.zero_grad()
        train_iterator = trange(int(num_epochs), desc="Epoch")
        for _ in train_iterator:
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", mininterval=60)):
                self.model.train()
                batch = tuple(t.to(device) for t in batch)

github jzlianglu / pykaldi2 / bin / train_ce.py
    # create model
    model_config = config["model_config"]
    model = lstm.LSTMStack(model_config["feat_dim"], model_config["label_size"], model_config["hidden_size"], model_config["num_layers"], model_config["dropout"], True)

    # Start training
    th.backends.cudnn.enabled = True
    if th.cuda.is_available():
        model.cuda()

    # optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    if args.hvd:
        # Broadcast parameters and optimizer state from rank 0 to all other processes.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        # Add Horovod Distributed Optimizer
        optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    # criterion
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    start_epoch = 0
    if args.resume_from_model:

        assert os.path.isfile(args.resume_from_model), "ERROR: model file {} does not exit!".format(args.resume_from_model)

        checkpoint = th.load(args.resume_from_model)
        state_dict = checkpoint['model']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(state_dict)
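
The train_ce.py snippet above broadcasts before restoring the checkpoint on every rank. An alternative, seen in Horovod's official PyTorch examples, is to restore the checkpoint on rank 0 only and let the broadcast calls propagate the restored state. A rough sketch follows (the args.hvd guard is omitted, and an 'optimizer' entry in the checkpoint is assumed, which the snippet above does not show):

    # Restore on rank 0 only; the broadcasts then copy the restored state to every worker.
    if args.resume_from_model and hvd.rank() == 0:
        checkpoint = th.load(args.resume_from_model)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])  # assumed 'optimizer' key

    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)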

github fhkingma / bitswap / model / imagenet_train.py
            train_set, num_replicas=hvd.size(), rank=hvd.rank())
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            test_set, num_replicas=hvd.size(), rank=hvd.rank())

    # setup mini-batch enumerator for both train-set and test-set
    train_loader = torch.utils.data.DataLoader(
        dataset=train_set, sampler=train_sampler if distributed else None,
        batch_size=batch_size, shuffle=False if distributed else True, drop_last=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        dataset=test_set, sampler=test_sampler if distributed else None,
        batch_size=batch_size, shuffle=False if distributed else True, drop_last=True, **kwargs)

    # Distributed: broadcast parameters to all the processes
    if distributed:
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    print("Data Dependent Initialization") if root_process else print("Data Dependent Initialization with ya!")
    # data-dependent initialization
    warmup(model, device, train_loader, 25, root_process)

    # if distributed over multiple GPU's, set-up a barrier ensuring that all the processes have initialized the models
    if distributed:
        hvd.allreduce_(torch.Tensor(0), name='barrier')

    # setup exponential moving average decay (EMA) for the parameters.
    # This basically means maintaining two sets of parameters during training/testing:
    # 1. parameters that are the result of EMA
    # 2. parameters not affected by EMA
    # The (1)st parameters are only active during test-time.
    ema = modules.EMA(0.999)
    with torch.no_grad():

github microsoft / nlp-recipes / utils_nlp / model / gensen / train_mlflow.py
    """ Setup for Horovod usage.

    Args:
        model(MultitaskModel): The MultitaskModel object.
        learning_rate(float): Learning rate for the model.

    Returns: hvd.DistributedOptimizer: Optimizer to use for computing
    gradients and applying updates.

    """
    # Horovod: scale learning rate by the number of GPUs.
    optimizer = optim.Adam(model.parameters(), lr=learning_rate * hvd.size())

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
    )

    return optimizer

github microsoft / nlp-recipes / utils_nlp / models / bert / question_answering_distributed_v1.py
                    p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)

        if distributed:
            optimizer = hvd.DistributedOptimizer(
                optimizer,
                named_parameters=self.model.named_parameters(),
                backward_passes_per_step=gradient_accumulation_steps,
            )

            hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        if warmup_proportion:
            warmup_steps = t_total * warmup_proportion
        else:
            warmup_steps = 0

        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

        global_step = 0
        tr_loss = 0.0
        # self.model.zero_grad()
        self.model.train()
        train_iterator = trange(int(num_epochs), desc="Epoch")
        for _ in train_iterator:
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", mininterval=60)):
                batch = tuple(t.to(device) for t in batch)