Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): fragment of a unit-test method — the enclosing `def` (and the
# definitions of `self`, `batch_size`, `DeviceType`, `nemo_asr`) lie outside
# this chunk.
# Data layer that reads the manifest as-is (no silence trimming).
normal_dl = nemo_asr.AudioToTextDataLayer(
featurizer_config=self.featurizer_config,
manifest_filepath=self.manifest_filepath,
labels=self.labels,
batch_size=batch_size,
placement=DeviceType.GPU,
drop_last=True,
shuffle=False
)
# Same manifest, but with silence trimmed from each utterance.
trimmed_dl = nemo_asr.AudioToTextDataLayer(
featurizer_config=self.featurizer_config,
manifest_filepath=self.manifest_filepath,
trim_silence=True,
labels=self.labels,
batch_size=batch_size,
placement=DeviceType.GPU,
drop_last=True,
shuffle=False
)
# Walk both loaders in lockstep (shuffle=False keeps them aligned) and check
# that trimming never makes a sample larger. Element [1] of each batch is
# presumably the per-sample audio-length tensor — TODO confirm against
# AudioToTextDataLayer's output port order.
for norm, trim in zip(normal_dl.data_iterator,
trimmed_dl.data_iterator):
for point in range(batch_size):
self.assertTrue(norm[1][point].data >= trim[1][point].data)
# NOTE(review): tail of construct_name(...) — its `def` line is outside this
# chunk. Builds a run identifier that embeds the main hyperparameters
# (name, learning rate, batch size x GPUs, epochs, weight decay, opt level).
return "{0}-lr_{1}-bs_{2}x{3}-e_{4}-wd_{5}-OPT-{6}".format(name, lr,
batch_size,
num_gpus,
num_epochs, wd,
opt_level)
# NOTE(review): top-level statements from a Jasper training script; `args`,
# `lr`, `opt_level`, `SummaryWriter`, `toml`, etc. are defined off-screen.
# Run name doubles as the TensorBoard log directory.
name = construct_name('Jasper10x5', lr, batch_size, num_gpus, num_epochs,
weight_decay)
tb_writer = SummaryWriter(name)
# local_rank is set by the distributed launcher; its presence selects
# multi-GPU placement.
if args.local_rank is not None:
device = nemo.core.DeviceType.AllGpu
print('Doing ALL GPU')
else:
device = nemo.core.DeviceType.GPU
# instantiate Neural Factory with supported backend
neural_factory = nemo.core.NeuralModuleFactory(
backend=nemo.core.Backend.PyTorch,
local_rank=args.local_rank,
optimization_level=opt_level,
placement=device)
# Model architecture and label set come from the example TOML config.
jasper_model_definition = toml.load("../../examples/nemo_asr/jasper10x5.toml")
jasper_model_definition['placement'] = device
labels = jasper_model_definition['labels']['labels']
# NOTE(review): hard-coded machine-specific dataset paths; the commented-out
# line suggests these should come from CLI args instead — consider restoring.
train_manifest = "/mnt/D1/Data/librispeech/librivox-train-all.json"
#train_manifest = args.train_manifest
val_manifest1 = "/mnt/D1/Data/librispeech/librivox-dev-clean.json"
# val_manifest2 = "/mnt/D1/Data/librispeech/librivox-dev-other.json"
# NOTE(review): fragment of a beam-search evaluation script; the opening
# `parser.add_argument(` for --beta_step is outside this chunk.
'--beta_step', type=float,
help='step for word count weight\'s tuning in \'eval\' mode',
required=False, default=0.1)
parser.add_argument(
"--beam_width", default=128, type=int)
args = parser.parse_args()
batch_size = args.batch_size
load_dir = args.load_dir
if args.local_rank is not None:
# LM-rescored beam search is single-GPU only; fail fast rather than
# produce wrong results under distributed evaluation.
if args.lm_path:
raise NotImplementedError(
"Beam search decoder with LM does not currently support "
"evaluation on multi-gpu.")
device = nemo.core.DeviceType.AllGpu
else:
device = nemo.core.DeviceType.GPU
# Instantiate Neural Factory with supported backend
neural_factory = nemo.core.NeuralModuleFactory(
backend=nemo.core.Backend.PyTorch,
local_rank=args.local_rank,
# mxprO1 = mixed-precision level O1 (presumably Apex AMP levels — confirm)
optimization_level=nemo.core.Optimization.mxprO1,
placement=device)
logger = neural_factory.logger
if args.local_rank is not None:
logger.info('Doing ALL GPU')
# Safe-mode YAML loader (ruamel.yaml) for the model config.
yaml = YAML(typ="safe")
# NOTE(review): the body of this `with` is truncated in this chunk.
with open(args.model_config) as f:
def get_cuda_device(placement):
    """Map a NeMo ``nemo.core.DeviceType`` onto the matching ``torch.device``.

    Args:
        placement: a ``nemo.core.DeviceType`` member.

    Returns:
        ``torch.device("cuda")`` when the placement targets one or all GPUs,
        otherwise ``torch.device("cpu")``.
    """
    wants_gpu = placement in (nemo.core.DeviceType.GPU,
                              nemo.core.DeviceType.AllGpu)
    return torch.device("cuda") if wants_gpu else torch.device("cpu")
# NOTE(review): fragment of a transformer language-model training script;
# `parser`, `nemo_nlp`, and `math` are defined off-screen.
parser.add_argument("--max_sequence_length", default=256, type=int)
parser.add_argument("--label_smoothing", default=0.1, type=float)
parser.add_argument("--beam_size", default=4, type=int)
parser.add_argument("--tokenizer_model", default="vocab.txt", type=str)
parser.add_argument("--predict_last_k", default=16, type=int)
parser.add_argument("--interactive", action="store_true")
args = parser.parse_args()
# create TensorboardX logger to log training statistics
name = f"transformer-lm-lr_{args.lr}-optim_{args.optimizer}-" \
f"warmup_{args.warmup_steps}-bs_{args.batch_size}"
# NOTE(review): TensorBoard logging is disabled here; `name` is computed but
# unused until the SummaryWriter line is restored.
tb_writer = None  # SummaryWriter(name)
# instantiate Neural Factory with supported backend
device = nemo.core.DeviceType.AllGpu if args.local_rank is not None \
else nemo.core.DeviceType.GPU
neural_factory = nemo.core.NeuralModuleFactory(
backend=nemo.core.Backend.PyTorch,
local_rank=args.local_rank,
optimization_level=nemo.core.Optimization.mxprO2,
placement=device)
# define tokenizer, in this example we use word-level tokenizer
# we also adjust the vocabulary size to make it multiple of 8 to accelerate
# training in fp16 mode with the use of Tensor Cores
tokenizer = nemo_nlp.WordTokenizer(f"{args.data_root}/{args.tokenizer_model}")
vocab_size = 8 * math.ceil(tokenizer.vocab_size / 8)
# instantiate necessary modules for the whole translation pipeline, namely
# data layers, encoder, decoder, output log_softmax, beam_search_translator
# and loss function
# NOTE(review): this call is truncated — its arguments lie outside this chunk.
train_data_layer = nemo_nlp.LanguageModelingDataLayer(
def __init__(self, path, labels, eos_id, pad_id,
             batch_size, drop_last=False, num_workers=0,
             **kwargs):
    """Build a ``TextDataset`` over *path* and a DataLoader that serves it.

    Args:
        path: location of the text data.
        labels: label/vocabulary set forwarded to ``TextDataset``.
        eos_id: end-of-sequence token id forwarded to ``TextDataset``.
        pad_id: padding token id handed to the collate function.
        batch_size: samples per batch.
        drop_last: drop the final incomplete batch when True.
        num_workers: DataLoader worker-process count.
        **kwargs: forwarded to the parent data-layer constructor.
    """
    super().__init__(**kwargs)
    self._dataset = TextDataset(path, labels, eos_id)
    # Under AllGpu placement each process must see a distinct shard, so a
    # DistributedSampler replaces plain shuffling.
    distributed = self._placement == DeviceType.AllGpu
    sampler = DistributedSampler(self._dataset) if distributed else None
    # noinspection PyTypeChecker
    self._dataloader = DataLoader(
        dataset=self._dataset,
        batch_size=batch_size,
        # pad8 presumably pads lengths to a multiple of 8 (Tensor Core
        # friendly) — confirm in _collate_fn.
        collate_fn=partial(self._collate_fn, pad_id=pad_id, pad8=True),
        drop_last=drop_last,
        # DataLoader forbids shuffle together with a sampler; shuffle only
        # in the non-distributed case.
        shuffle=sampler is None,
        sampler=sampler,
        num_workers=num_workers,
    )
# NOTE(review): fragment of a NER training script; `args` and the parser are
# defined off-screen.
data_file = os.path.join(args.data_dir, "train.txt")
# Fail fast with download instructions if the dataset is missing.
if not os.path.isfile(data_file):
raise FileNotFoundError("CoNLL-2003 dataset not found. Dataset can be "
+ "obtained at https://github.com/kyzhouhzau/BERT"
+ "-NER/tree/master/data and should be put in a "
+ "folder at the same level as ner.py.")
# tensorboardX is optional: degrade to no logging instead of crashing.
try:
import tensorboardX
tb_writer = tensorboardX.SummaryWriter(args.tensorboard_filename)
except ModuleNotFoundError:
tb_writer = None
print("Tensorboard is not available.")
# local_rank set by the distributed launcher selects multi-GPU placement.
if args.local_rank is not None:
device = nemo.core.DeviceType.AllGpu
else:
device = nemo.core.DeviceType.GPU
# mxprO1/mxprO0 = mixed precision on/off (presumably Apex AMP levels —
# confirm).
if args.mixed_precision is True:
optimization_level = nemo.core.Optimization.mxprO1
else:
optimization_level = nemo.core.Optimization.mxprO0
# Instantiate Neural Factory with supported backend
neural_factory = nemo.core.NeuralModuleFactory(
backend=nemo.core.Backend.PyTorch,
local_rank=args.local_rank,
optimization_level=optimization_level,
placement=device)
# NOTE(review): the body of this `if` is truncated in this chunk.
if args.bert_checkpoint is None:
# NOTE(review): fragment of a translation data-layer __init__ — the `def`
# line (and the leading `self, tokenizer_src` parameters) are outside this
# chunk.
tokenizer_tgt,
dataset_src,
dataset_tgt,
tokens_in_batch=1024,
clean=False,
dataset_type=TranslationDataset,
**kwargs):
# Bundle constructor arguments for the underlying dataset class.
dataset_params = {'tokenizer_src': tokenizer_src,
'tokenizer_tgt': tokenizer_tgt,
'dataset_src': dataset_src,
'dataset_tgt': dataset_tgt,
'tokens_in_batch': tokens_in_batch,
'clean': clean}
super().__init__(dataset_type, dataset_params, **kwargs)
# Under AllGpu placement, shard batches across processes instead of
# shuffling in the loader.
if self._placement == nemo.core.DeviceType.AllGpu:
sampler = pt_data.distributed.DistributedSampler(self._dataset)
else:
sampler = None
# batch_size=1 because the dataset presumably pre-batches by
# tokens_in_batch — confirm against the dataset implementation.
self._dataloader = pt_data.DataLoader(dataset=self._dataset,
batch_size=1,
collate_fn=self._collate_fn,
shuffle=sampler is None,
sampler=sampler)
# NOTE(review): near-duplicate of the NER setup fragment earlier in this
# chunk; the `if not os.path.isfile(...)` guard for this raise is off-screen.
raise FileNotFoundError("CoNLL-2003 dataset not found. Dataset can be "
+ "obtained at https://github.com/kyzhouhzau/BERT"
+ "-NER/tree/master/data and should be put in a "
+ "folder at the same level as ner.py.")
# tensorboardX is optional: degrade to no logging instead of crashing.
try:
import tensorboardX
tb_writer = tensorboardX.SummaryWriter(args.tensorboard_filename)
except ModuleNotFoundError:
tb_writer = None
print("Tensorboard is not available.")
if args.local_rank is not None:
device = nemo.core.DeviceType.AllGpu
else:
device = nemo.core.DeviceType.GPU
# mxprO1/mxprO0 = mixed precision on/off (presumably Apex AMP levels —
# confirm).
if args.mixed_precision is True:
optimization_level = nemo.core.Optimization.mxprO1
else:
optimization_level = nemo.core.Optimization.mxprO0
# Instantiate Neural Factory with supported backend
neural_factory = nemo.core.NeuralModuleFactory(
backend=nemo.core.Backend.PyTorch,
local_rank=args.local_rank,
optimization_level=optimization_level,
placement=device)
# Default to the pretrained model's own tokenizer when no checkpoint given.
if args.bert_checkpoint is None:
tokenizer = NemoBertTokenizer(args.pretrained_bert_model)
# NOTE(review): below this point the chunk switches to a duplicate of the
# transformer language-model script fragment.
parser.add_argument("--attn_layer_dropout", default=0.2, type=float)
parser.add_argument("--max_sequence_length", default=256, type=int)
parser.add_argument("--label_smoothing", default=0.1, type=float)
parser.add_argument("--beam_size", default=4, type=int)
parser.add_argument("--tokenizer_model", default="vocab.txt", type=str)
parser.add_argument("--predict_last_k", default=16, type=int)
parser.add_argument("--interactive", action="store_true")
args = parser.parse_args()
# create TensorboardX logger to log training statistics
name = f"transformer-lm-lr_{args.lr}-optim_{args.optimizer}-" \
f"warmup_{args.warmup_steps}-bs_{args.batch_size}"
# NOTE(review): TensorBoard logging disabled; `name` is unused until the
# SummaryWriter line is restored.
tb_writer = None  # SummaryWriter(name)
# instantiate Neural Factory with supported backend
device = nemo.core.DeviceType.AllGpu if args.local_rank is not None \
else nemo.core.DeviceType.GPU
neural_factory = nemo.core.NeuralModuleFactory(
backend=nemo.core.Backend.PyTorch,
local_rank=args.local_rank,
optimization_level=nemo.core.Optimization.mxprO2,
placement=device)
# define tokenizer, in this example we use word-level tokenizer
# we also adjust the vocabulary size to make it multiple of 8 to accelerate
# training in fp16 mode with the use of Tensor Cores
tokenizer = nemo_nlp.WordTokenizer(f"{args.data_root}/{args.tokenizer_model}")
vocab_size = 8 * math.ceil(tokenizer.vocab_size / 8)
# instantiate necessary modules for the whole translation pipeline, namely
# data layers, encoder, decoder, output log_softmax, beam_search_translator
# and loss function