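# Shared imports assumed by the snippets below (not part of the original excerpts):
import mxnet as mx
from mxnet import gluon
import gluonnlp as nlp
from gluonnlp.data import batchify as btf
from functools import partial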
dev_tsv = _task.dataset_dev()
dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv]
loader_dev_list = []
for segment, data in dev_tsv_list:
    data_dev = mx.gluon.data.SimpleDataset(list(map(trans, data)))
    loader_dev = mx.gluon.data.DataLoader(data_dev,
                                          batch_size=dev_batch_size,
                                          num_workers=4,
                                          shuffle=False,
                                          batchify_fn=batchify_fn)
    loader_dev_list.append((segment, loader_dev))
# batchify for data test
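# (token ids are padded with pad_val, valid lengths stacked, segment ids padded with 0)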
test_batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to),
    nlp.data.batchify.Stack(),
    nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to))
# transform for data test
test_trans = partial(convert_examples_to_features,
                     tokenizer=_tokenizer,
                     truncate_length=max_len,
                     cls_token=_vocab.cls_token,
                     sep_token=_vocab.sep_token,
                     class_labels=None,
                     is_test=True,
                     vocab=_vocab)
# data test. For MNLI, more than one test set is available
test_tsv = _task.dataset_test()
test_tsv_list = test_tsv if isinstance(test_tsv, list) else [test_tsv]
loader_test_list = []
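# The loop that builds the test loaders is not shown here; a minimal sketch,
# assuming it mirrors the dev-set loop above but with test_trans and test_batchify_fn:
for segment, data in test_tsv_list:
    data_test = mx.gluon.data.SimpleDataset(list(map(test_trans, data)))
    loader_test = mx.gluon.data.DataLoader(data_test,
                                           batch_size=dev_batch_size,
                                           num_workers=4,
                                           shuffle=False,
                                           batchify_fn=test_batchify_fn)
    loader_test_list.append((segment, loader_test))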
print_sample(test_data, 1)
print('-' * 80)
idsl_transform = IDSLSubwordTransform(subword_vocab=bert_vocab,
                                      subword_tokenizer=tokenizer,
                                      slot_vocab=slot_vocab,
                                      cased=args.cased)
train_data_bert = train_data.transform(idsl_transform, lazy=False)
dev_data_bert = dev_data.transform(idsl_transform, lazy=False)
test_data_bert = test_data.transform(idsl_transform, lazy=False)
# Construct the DataLoader
batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),  # Subword ID
                                      nlp.data.batchify.Pad(pad_val=0),  # Subword Mask
                                      nlp.data.batchify.Pad(pad_val=0),  # Beginning of subword
                                      nlp.data.batchify.Pad(pad_val=0),  # Tag IDs
                                      nlp.data.batchify.Stack(),         # Intent Label
                                      nlp.data.batchify.Stack())         # Valid Length
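# SortedBucketSampler sorts samples by length within buckets of roughly
# batch_size * mult elements, so each batch holds similar-length sequences
# and padding overhead stays small.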
train_batch_sampler = nlp.data.sampler.SortedBucketSampler(
    [len(ele) for ele in train_data_bert],
    batch_size=args.batch_size,
    mult=20,
    shuffle=True)
train_loader = gluon.data.DataLoader(dataset=train_data_bert,
                                     num_workers=4,
                                     batch_sampler=train_batch_sampler,
                                     batchify_fn=batchify_fn)
dev_loader = gluon.data.DataLoader(dataset=dev_data_bert,
                                   num_workers=4,
                                   batch_size=args.batch_size,
                                   batchify_fn=batchify_fn,
                                   shuffle=False)
test_loader = gluon.data.DataLoader(dataset=test_data_bert,
                                    num_workers=4,
                                    batch_size=args.batch_size,
                                    batchify_fn=batchify_fn,
                                    shuffle=False)
# The flags below belong to a call whose head is truncated in the source;
# assumed to be nlp.model.get_model loading the BERT backbone:
bert, vocab = nlp.model.get_model(name=model_name,
                                  dataset_name=dataset_name,
                                  pretrained=pretrained,
                                  ctx=ctx,
                                  use_pooler=False,
                                  use_decoder=False,
                                  use_classifier=False)
if args.sentencepiece:
    tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab, lower=lower)
else:
    tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=lower)
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Stack(),
    nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
    nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
    nlp.data.batchify.Stack('float32'),
    nlp.data.batchify.Stack('float32'),
    nlp.data.batchify.Stack('float32'))
net = BertForQA(bert=bert)
if model_parameters:
    # load complete BertForQA parameters
    net.load_parameters(model_parameters, ctx=ctx, cast_dtype=True)
elif pretrained_bert_parameters:
    # only load BertModel parameters
    bert.load_parameters(pretrained_bert_parameters, ctx=ctx,
                         ignore_extra=True, cast_dtype=True)
    net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
elif pretrained:
    # pretrained BertModel weights are already loaded; initialize only the span classifier
    net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
else:
    # no checkpoint is loaded
    net.initialize(init=mx.init.Normal(0.02), ctx=ctx)
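# A typical follow-up (an assumption, not shown in this snippet) is to hybridize
# the network for faster execution:
net.hybridize(static_alloc=True)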
assert dataset_type in ['train', 'val', 'test']
if args.bucket_scheme == 'constant':
    bucket_scheme = nlp.data.ConstWidthBucket()
elif args.bucket_scheme == 'linear':
    bucket_scheme = nlp.data.LinearWidthBucket()
elif args.bucket_scheme == 'exp':
    bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
else:
    raise NotImplementedError
data_lengths = get_data_lengths(data_set)
if dataset_type == 'train':
    train_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
                                  btf.Stack(dtype='float32'), btf.Stack(dtype='float32'))
else:
    data_lengths = list(map(lambda x: x[-1], data_lengths))
    test_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
                                 btf.Stack(dtype='float32'), btf.Stack(dtype='float32'),
                                 btf.Stack())
batch_sampler = nlp.data.FixedBucketSampler(lengths=data_lengths,
                                            batch_size=(args.batch_size
                                                        if dataset_type == 'train'
                                                        else args.test_batch_size),
                                            num_buckets=args.num_buckets,
                                            ratio=args.bucket_ratio,
                                            shuffle=(dataset_type == 'train'),
                                            use_average_length=use_average_length,
                                            num_shards=num_shards,
                                            bucket_scheme=bucket_scheme)
# The call the following keyword arguments belong to is truncated in the source;
# assumed to configure the example-to-feature transform `trans` used below
# (e.g. GluonNLP's BERTDatasetTransform):
trans = BERTDatasetTransform(tokenizer, max_len,
                             class_labels=task.class_labels,
                             label_alias=task.label_alias,
                             pad=pad, pair=task.is_pair,
                             has_label=True)
# data train
# task.dataset_train returns (segment_name, dataset)
train_tsv = task.dataset_train()[1]
data_train = mx.gluon.data.SimpleDataset(pool.map(trans, train_tsv))
data_train_len = data_train.transform(
    lambda input_id, length, segment_id, label_id: length, lazy=False)
# bucket sampler for training
pad_val = vocab[vocab.padding_token]
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(axis=0, pad_val=pad_val),  # input
    nlp.data.batchify.Stack(),                       # length
    nlp.data.batchify.Pad(axis=0, pad_val=0),        # segment
    nlp.data.batchify.Stack(label_dtype))            # label
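# ratio=0 keeps the batch size fixed for every bucket; a positive ratio would
# scale up the batch size of buckets that hold shorter sequences.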
batch_sampler = nlp.data.sampler.FixedBucketSampler(
    data_train_len,
    batch_size=batch_size,
    num_buckets=10,
    ratio=0,
    shuffle=True)
# data loader for training
loader_train = gluon.data.DataLoader(
    dataset=data_train,
    num_workers=num_workers,
    batch_sampler=batch_sampler,
    batchify_fn=batchify_fn)
# data train
# task.dataset_train returns (segment_name, dataset)
train_tsv = _task.dataset_train()[1]
data_train = list(map(trans, train_tsv))
data_train = mx.gluon.data.SimpleDataset(data_train)
data_train_len = data_train.transform(
    lambda _, valid_length, segment_ids, label: valid_length, lazy=False)
# bucket sampler for training
pad_val = _vocab[_vocab.padding_token]
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to),  # input
    nlp.data.batchify.Stack(),                                               # length
    nlp.data.batchify.Pad(axis=0, pad_val=4, round_to=args.round_to),        # segment
    nlp.data.batchify.Stack(label_dtype))                                    # label
batch_sampler = nlp.data.sampler.FixedBucketSampler(data_train_len,
                                                    batch_size=batch_size,
                                                    num_buckets=10,
                                                    ratio=0,
                                                    shuffle=True)
# data loader for training
loader_train = gluon.data.DataLoader(dataset=data_train,
                                     num_workers=4,
                                     batch_sampler=batch_sampler,
                                     batchify_fn=batchify_fn)
# data dev. For MNLI, more than one dev set is available
dev_tsv = _task.dataset_dev()
dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv]
loader_dev_list = []
for segment, data in dev_tsv_list:
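    # The loop body is truncated in the source; a minimal sketch, assuming it
    # mirrors the dev-set loop shown earlier:
    data_dev = mx.gluon.data.SimpleDataset(list(map(trans, data)))
    loader_dev = mx.gluon.data.DataLoader(data_dev,
                                          batch_size=dev_batch_size,
                                          num_workers=4,
                                          shuffle=False,
                                          batchify_fn=batchify_fn)
    loader_dev_list.append((segment, loader_dev))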
# These keyword arguments belong to a call whose head is truncated in the source;
# assumed to be nlp.model.get_model loading the BERT backbone without the
# pooler/decoder/classifier heads:
bert, vocab = nlp.model.get_model(name=model_name,
                                  dataset_name=dataset_name,
                                  vocab=vocab,
                                  pretrained=pretrained,
                                  ctx=ctx,
                                  use_pooler=False,
                                  use_decoder=False,
                                  use_classifier=False)
def get_dataloader(dataset, batch_size, is_train=True):
    # Construct the DataLoader: pad the data, stack the labels and lengths
    batchify_fn = nlp.data.batchify.Tuple(
        nlp.data.batchify.Pad(axis=0),
        nlp.data.batchify.Stack())
    dataloader = None
    # dataloader for training
    if is_train:
        data_lengths = [len(sample[0]) for sample in dataset]
        # In this example, we use a FixedBucketSampler,
        # which assigns each data sample to a fixed bucket based on its length.
        batch_sampler = nlp.data.sampler.FixedBucketSampler(
            data_lengths,
            batch_size=batch_size,
            num_buckets=10,
            ratio=0.2,
            shuffle=True)
        # the call below is truncated in the source; remaining arguments assumed
        dataloader = gluon.data.DataLoader(
            dataset=dataset,
            batch_sampler=batch_sampler,
            batchify_fn=batchify_fn)
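    # Sketch of the evaluation branch (not in the original snippet), assuming a
    # plain sequential DataLoader is used when is_train is False:
    else:
        dataloader = gluon.data.DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=False,
            batchify_fn=batchify_fn)
    return dataloader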