for pad_index in [[0], [1], [2], [0, 1], [1, 2], [0, 1, 2]]:
    shapes = [[[2 for _ in range(ndim)] for _ in range(batch_size)]
              for _ in range(TOTAL_ELE_NUM)]
    for j in pad_index:
        for i in range(batch_size):
            shapes[j][i][axis] = np.random.randint(length_min, length_max)
    random_data_npy = [tuple(np.random.normal(0, 1, shapes[j][i]).astype(dtype)
                             for j in range(TOTAL_ELE_NUM)) for i in range(batch_size)]
    batchify_fn = []
    for j in range(TOTAL_ELE_NUM):
        if j in pad_index:
            batchify_fn.append(batchify.Pad(axis=axis, pad_val=pad_val, ret_length=True,
                                            dtype=_dtype))
        else:
            batchify_fn.append(batchify.Stack(dtype=_dtype))
    batchify_fn = batchify.Tuple(batchify_fn)
    ret_use_npy = batchify_fn(random_data_npy)
    with pytest.warns(UserWarning):
        # Using Pad with NDArrays is discouraged for speed reasons.
        ret_use_mx = batchify_fn([tuple(mx.nd.array(ele[i], dtype=dtype)
                                        for i in range(TOTAL_ELE_NUM))
                                  for ele in random_data_npy])
    for i in range(TOTAL_ELE_NUM):
        if i in pad_index:
            assert ret_use_npy[i][0].dtype == ret_use_mx[i][0].dtype == dtype
            assert ret_use_npy[i][1].dtype == ret_use_mx[i][1].dtype == np.int32
            assert_allclose(ret_use_npy[i][0].asnumpy(),
                            ret_use_mx[i][0].asnumpy())
            assert_allclose(ret_use_npy[i][1].asnumpy(),
                            ret_use_mx[i][1].asnumpy())
            assert ret_use_npy[i][1].shape == (batch_size,)
        else:
            # Stack'd elements carry no length array, so compare them directly.
            assert ret_use_npy[i].dtype == ret_use_mx[i].dtype == dtype
            assert_allclose(ret_use_npy[i].asnumpy(), ret_use_mx[i].asnumpy())
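
# A minimal, self-contained sketch of the collation the test above exercises,
# assuming the same gluonnlp batchify API (Pad with ret_length=True, Stack, Tuple).
# Names prefixed with toy_ are illustrative only.
import numpy as np
from gluonnlp.data import batchify

toy_batch = [(np.arange(3), 0), (np.arange(5), 1)]          # (sequence, label) samples
toy_collate = batchify.Tuple(
    batchify.Pad(pad_val=0, ret_length=True),               # pad the sequences, also return lengths
    batchify.Stack())                                       # stack the scalar labels
(toy_padded, toy_lengths), toy_labels = toy_collate(toy_batch)
# toy_padded has shape (2, 5); toy_lengths holds [3, 5]; toy_labels holds [0, 1]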

# Prepare target-side tokens for a translation model: real references when
# evaluating, empty placeholders when only running inference.
        args.tgt_corpus,
        sentence_normalizer=tgt_normalizer,
        base_tokenizer=base_tgt_tokenizer,
        bpe_tokenizer=tgt_tokenizer,
        add_bos=True,
        add_eos=True
    )
else:  # when applying inference, populate the fake tgt tokens
    all_tgt_token_ids = all_tgt_lines = [[] for i in range(len(all_src_token_ids))]
test_dataloader = gluon.data.DataLoader(
    list(zip(all_src_token_ids,
             [len(ele) for ele in all_src_token_ids],
             all_tgt_token_ids,
             [len(ele) for ele in all_tgt_token_ids])),
    batch_size=32,
    batchify_fn=Tuple(Pad(), Stack(), Pad(), Stack()),
    shuffle=False)
ctx = ctx_l[0]
pred_sentences = []
start_eval_time = time.time()
# Evaluate
if not args.inference:
    avg_nll_loss = 0
    ntokens = 0
    for i, (src_token_ids, src_valid_length, tgt_token_ids, tgt_valid_length)\
            in enumerate(test_dataloader):
        src_token_ids = mx.np.array(src_token_ids, ctx=ctx, dtype=np.int32)
        src_valid_length = mx.np.array(src_valid_length, ctx=ctx, dtype=np.int32)
        tgt_token_ids = mx.np.array(tgt_token_ids, ctx=ctx, dtype=np.int32)
        tgt_valid_length = mx.np.array(tgt_valid_length, ctx=ctx, dtype=np.int32)
        # Teacher forcing: feed target tokens 0..T-1 to predict tokens 1..T.
        tgt_pred = model(src_token_ids, src_valid_length, tgt_token_ids[:, :-1],
                         tgt_valid_length - 1)

# Optimizer with an inverse-square-root warmup schedule, followed by a bucketed
# training loader and a plain validation loader.
base_lr = args.lr
lr_scheduler = InverseSquareRootScheduler(warmup_steps=args.warmup_steps, base_lr=base_lr,
                                          warmup_init_lr=args.warmup_init_lr)
trainer = gluon.Trainer(model.collect_params(), 'adam',
                        {'learning_rate': args.lr, 'beta1': 0.9,
                         'beta2': 0.98, 'epsilon': 1e-9, 'lr_scheduler': lr_scheduler})
# Load data
if args.bucket_scheme == 'constant':
    bucket_scheme = ConstWidthBucket()
elif args.bucket_scheme == 'linear':
    bucket_scheme = LinearWidthBucket()
elif args.bucket_scheme == 'exp':
    bucket_scheme = ExpWidthBucket(bucket_len_step=1.2)
else:
    raise NotImplementedError
batchify_fn = bf.Tuple(bf.Pad(), bf.Pad(), bf.Stack(), bf.Stack(), bf.Stack())
# TODO(sxjscience) Support auto-bucket-size tuning
train_batch_sampler = FixedBucketSampler(lengths=[(ele[2], ele[3]) for ele in data_train],
                                         batch_size=args.batch_size,
                                         num_buckets=args.num_buckets,
                                         ratio=args.bucket_ratio,
                                         shuffle=True,
                                         use_average_length=True,
                                         bucket_scheme=bucket_scheme,
                                         seed=args.seed)
train_data_loader = gluon.data.DataLoader(data_train,
                                          batch_sampler=train_batch_sampler,
                                          batchify_fn=batchify_fn,
                                          num_workers=0)
logging.info(train_batch_sampler)
val_data_loader = gluon.data.DataLoader(data_val,
                                        batch_size=args.val_batch_size,
                                        batchify_fn=batchify_fn,
                                        shuffle=False)
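
# A small illustration of the bucketing idea behind FixedBucketSampler above,
# assuming the same API; toy_lengths and toy_sampler are illustrative names.
# Samples of similar length share a bucket, so per-batch padding stays small,
# and logging the sampler (as done above) reports its bucket statistics.
toy_lengths = [5, 7, 8, 30, 31, 12]
toy_sampler = FixedBucketSampler(lengths=toy_lengths, batch_size=2,
                                 num_buckets=3, shuffle=False)
for toy_batch_ids in toy_sampler:
    print(toy_batch_ids)  # indices whose lengths fall into the same bucket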

# Intent classification / slot labeling: report vocabulary sizes, show one
# sample, convert the datasets to subword features, and define the batchify.
print(' #Intent = {}'.format(len(intent_vocab)))
print(' #Slot = {}'.format(len(slot_vocab)))
# Display an example
print('Display a sample')
print_sample(test_data, 1)
print('-' * 80)
idsl_transform = IDSLSubwordTransform(subword_vocab=bert_vocab,
                                      subword_tokenizer=tokenizer,
                                      slot_vocab=slot_vocab,
                                      cased=args.cased)
train_data_bert = train_data.transform(idsl_transform, lazy=False)
dev_data_bert = dev_data.transform(idsl_transform, lazy=False)
test_data_bert = test_data.transform(idsl_transform, lazy=False)
# Construct the DataLoader
batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),   # Subword ID
                                      nlp.data.batchify.Pad(pad_val=0),   # Subword mask
                                      nlp.data.batchify.Pad(pad_val=0),   # Beginning of subword
                                      nlp.data.batchify.Pad(pad_val=0),   # Tag IDs
                                      nlp.data.batchify.Stack(),          # Intent label
                                      nlp.data.batchify.Stack())          # Valid length
train_batch_sampler = nlp.data.sampler.SortedBucketSampler(
    [len(ele) for ele in train_data_bert],
    batch_size=args.batch_size,
    mult=20,
    shuffle=True)
train_loader = gluon.data.DataLoader(dataset=train_data_bert,
                                     num_workers=4,
                                     batch_sampler=train_batch_sampler,
                                     batchify_fn=batchify_fn)
dev_loader = gluon.data.DataLoader(dataset=dev_data_bert,
                                   num_workers=4,
                                   batch_size=args.batch_size,
                                   batchify_fn=batchify_fn,
                                   shuffle=False)

def get_dataloader(dataset, batch_size, is_train=True):
    # Construct the DataLoader: pad the data, stack the label and lengths
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0),
        nlp.data.batchify.Stack())
    dataloader = None
    # DataLoader for training
    if is_train:
        data_lengths = [len(sample[0]) for sample in dataset]
        # In this example, we use a FixedBucketSampler,
        # which assigns each data sample to a fixed bucket based on its length.
        batch_sampler = nlp.data.sampler.FixedBucketSampler(
            data_lengths,
            batch_size=batch_size,
            num_buckets=10,
            ratio=0.2,
            shuffle=True)
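        # A plausible completion of get_dataloader (the usual pattern): bucketed
        # batches for training, plain sequential batches for evaluation.
        # Assumes `gluon` is imported from mxnet, as in the other snippets here.
        dataloader = gluon.data.DataLoader(dataset,
                                           batch_sampler=batch_sampler,
                                           batchify_fn=batchify_fn)
    else:
        dataloader = gluon.data.DataLoader(dataset,
                                           batch_size=batch_size,
                                           shuffle=False,
                                           batchify_fn=batchify_fn)
    return dataloader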

trans = BERTDatasetTransform(tokenizer, max_len,
                             vocab=vocab,
                             class_labels=task.class_labels,
                             label_alias=task.label_alias,
                             pad=pad, pair=task.is_pair,
                             has_label=True)

# Training data
# task.dataset_train returns (segment_name, dataset)
train_tsv = task.dataset_train()[1]
data_train = mx.gluon.data.SimpleDataset(pool.map(trans, train_tsv))
data_train_len = data_train.transform(
    lambda input_id, length, segment_id, label_id: length, lazy=False)
# Bucket sampler for training
pad_val = vocab[vocab.padding_token]
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(axis=0, pad_val=pad_val),  # input
    nlp.data.batchify.Stack(),                       # length
    nlp.data.batchify.Pad(axis=0, pad_val=0),        # segment
    nlp.data.batchify.Stack(label_dtype))            # label
batch_sampler = nlp.data.sampler.FixedBucketSampler(
    data_train_len,
    batch_size=batch_size,
    num_buckets=10,
    ratio=0,
    shuffle=True)
# DataLoader for training
loader_train = gluon.data.DataLoader(
    dataset=data_train,
    num_workers=num_workers,
    batch_sampler=batch_sampler,
    batchify_fn=batchify_fn)
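
# Hypothetical training-loop skeleton for loader_train above; each batch unpacks
# into the four fields in the Tuple batchify order (input ids, lengths, segment
# ids, labels).
for token_ids, valid_lengths, segment_ids, labels in loader_train:
    pass  # move the arrays to the target context, run the classifier, backprop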

def prepare_data_loader(args, dataset, vocab, test=False):
    """Read data and build the data loader."""
    # Preprocess
    dataset = dataset.transform(lambda s1, s2, label: (vocab(s1), vocab(s2), label),
                                lazy=False)
    # Batching
    batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0), btf.Stack(dtype='int32'))
    data_lengths = [max(len(d[0]), len(d[1])) for d in dataset]
    batch_sampler = nlp.data.FixedBucketSampler(lengths=data_lengths,
                                                batch_size=args.batch_size,
                                                shuffle=(not test))
    data_loader = gluon.data.DataLoader(dataset=dataset,
                                        batch_sampler=batch_sampler,
                                        batchify_fn=batchify_fn)
    return data_loader
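
# Hypothetical call site for prepare_data_loader; `args`, `dev_dataset` and
# `vocab` are assumed to exist in the surrounding script.
dev_loader = prepare_data_loader(args, dev_dataset, vocab, test=True)
for s1_ids, s2_ids, labels in dev_loader:
    pass  # two padded sentence batches plus the stacked int32 labels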

# XLNet for question answering: fetch the pretrained backbone, vocabulary and
# tokenizer, and define the batchify for the SQuAD-style features.
get_pretrained = True
get_model_params = {
    'name': args.model,
    'dataset_name': args.dataset,
    'pretrained': get_pretrained,
    'ctx': ctx,
    'use_decoder': False,
    'dropout': args.dropout,
    'attention_dropout': args.attention_dropout
}
# Model, vocabulary and tokenizer
xlnet_base, vocab, tokenizer = model.get_model(**get_model_params)
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Stack('int32'),                                              # example_id
    nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], dtype='int32',
                          round_to=args.round_to),                                 # input_ids
    nlp.data.batchify.Pad(axis=0, pad_val=3, dtype='int32', round_to=args.round_to),  # segment_ids
    nlp.data.batchify.Stack('float32'),                                            # valid_length
    nlp.data.batchify.Pad(axis=0, pad_val=1, round_to=args.round_to),              # p_mask
    nlp.data.batchify.Stack('float32'),                                            # start_position
    nlp.data.batchify.Stack('float32'),                                            # end_position
    nlp.data.batchify.Stack('float32'))                                            # is_impossible
if pretrained_xlnet_parameters:
    # Only load the XLNetModel parameters
    nlp.utils.load_parameters(xlnet_base, pretrained_xlnet_parameters, ctx=ctx, ignore_extra=True,
                              cast_dtype=True)
units = xlnet_base._net._units
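
# A small sketch of the round_to behaviour used in the batchify above: the padded
# dimension is rounded up to a multiple of round_to (8 here), which keeps the
# sequence dimension friendly for fp16 kernels. Illustrative only.
import numpy as np
toy_pad = nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=8)
toy_out = toy_pad([np.arange(3), np.arange(11)])
# toy_out.shape == (2, 16): the longest sample (length 11) is rounded up to 16.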
"""
num_files = len(nlp.utils.glob(data))
logging.info('%d files are found.', num_files)
assert num_files >= num_parts, \
'The number of text files must be no less than the number of ' \
'workers/partitions (%d). Only %d files at %s are found.'%(num_parts, num_files, data)
dataset_params = {'tokenizer': tokenizer, 'max_seq_length': max_seq_length,
'short_seq_prob': short_seq_prob, 'masked_lm_prob': masked_lm_prob,
'max_predictions_per_seq': max_predictions_per_seq, 'vocab':vocab,
'whole_word_mask': whole_word_mask}
sampler_params = {'batch_size': batch_size, 'shuffle': shuffle,
'num_ctxes': num_ctxes, 'num_buckets': num_buckets}
dataset_fn = prepare_pretrain_text_dataset
sampler_fn = prepare_pretrain_bucket_sampler
pad_val = vocab[vocab.padding_token]
batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Pad(pad_val=pad_val, round_to=8), # input_id
nlp.data.batchify.Pad(pad_val=pad_val), # masked_id
nlp.data.batchify.Pad(pad_val=0), # masked_position
nlp.data.batchify.Pad(pad_val=0), # masked_weight
nlp.data.batchify.Stack(), # next_sentence_label
nlp.data.batchify.Pad(pad_val=0, round_to=8), # segment_id
nlp.data.batchify.Stack())
split_sampler = nlp.data.SplitSampler(num_files, num_parts=num_parts,
part_index=part_idx, repeat=repeat)
dataloader = nlp.data.DatasetLoader(data,
file_sampler=split_sampler,
dataset_fn=dataset_fn,
batch_sampler_fn=sampler_fn,
dataset_params=dataset_params,
batch_sampler_params=sampler_params,
batchify_fn=batchify_fn,

        we pop a cached processed dataset.
    num_max_dataset_cached : int, default is 0
        Maximum number of cached datasets. It is valid only if dataset_cached is True.
    """
    num_files = len(nlp.utils.glob(data))
    logging.info('%d files are found.', num_files)
    assert num_files >= num_parts, \
        'The number of text files must be no less than the number of ' \
        'workers/partitions (%d). Only %d files at %s are found.' % (num_parts, num_files, data)
    dataset_params = {'allow_pickle': True}
    sampler_params = {'batch_size': batch_size, 'shuffle': shuffle,
                      'num_ctxes': num_ctxes, 'num_buckets': num_buckets}
    dataset_fn = prepare_pretrain_npz_dataset
    sampler_fn = prepare_pretrain_bucket_sampler
    pad_val = vocab[vocab.padding_token]
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(pad_val=pad_val, round_to=8),  # input_id
        nlp.data.batchify.Pad(pad_val=pad_val),              # masked_id
        nlp.data.batchify.Pad(pad_val=0),                    # masked_position
        nlp.data.batchify.Pad(pad_val=0),                    # masked_weight
        nlp.data.batchify.Stack(),                           # next_sentence_label
        nlp.data.batchify.Pad(pad_val=0, round_to=8),        # segment_id
        nlp.data.batchify.Stack())                           # valid_length
    split_sampler = nlp.data.SplitSampler(num_files, num_parts=num_parts,
                                          part_index=part_idx, repeat=repeat)
    dataloader = nlp.data.DatasetLoader(data,
                                        file_sampler=split_sampler,
                                        dataset_fn=dataset_fn,
                                        batch_sampler_fn=sampler_fn,
                                        dataset_params=dataset_params,
                                        batch_sampler_params=sampler_params,
                                        batchify_fn=batchify_fn)
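
    # Hypothetical consumption of the DatasetLoader built above: every batch it
    # yields bundles the seven fields produced by the Tuple batchify (input_id,
    # masked_id, masked_position, masked_weight, next_sentence_label,
    # segment_id, valid_length).
    for data_batch in dataloader:
        pass  # split across devices and run one pretraining step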