trans = BERTDatasetTransform(tokenizer, max_len,
class_labels=task.class_labels,
label_alias=task.label_alias,
pad=pad, pair=task.is_pair,
has_label=True)
# data train
# task.dataset_train returns (segment_name, dataset)
train_tsv = task.dataset_train()[1]
data_train = mx.gluon.data.SimpleDataset(pool.map(trans, train_tsv))
data_train_len = data_train.transform(
lambda input_id, length, segment_id, label_id: length, lazy=False)
# bucket sampler for training
batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Pad(axis=0), nlp.data.batchify.Stack(),
nlp.data.batchify.Pad(axis=0), nlp.data.batchify.Stack(label_dtype))
batch_sampler = nlp.data.sampler.FixedBucketSampler(
data_train_len,
batch_size=batch_size,
num_buckets=10,
ratio=0,
shuffle=True)
# data loader for training
loader_train = gluon.data.DataLoader(
dataset=data_train,
num_workers=1,
batch_sampler=batch_sampler,
batchify_fn=batchify_fn)
# data dev. For MNLI, more than one dev set is available
dev_tsv = task.dataset_dev()
dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv]
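
# A minimal, self-contained sketch (toy samples; every name below is made up) of how the
# Tuple(Pad, Stack, Pad, Stack) batchify function above collates variable-length samples
# of (input_ids, valid_length, segment_ids, label) into one padded batch.
import gluonnlp as nlp

toy_samples = [([2, 15, 9, 3], 4, [0, 0, 0, 0], 1),
               ([2, 7, 3], 3, [0, 0, 0], 0)]
toy_batchify = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(axis=0, pad_val=0), nlp.data.batchify.Stack(),
    nlp.data.batchify.Pad(axis=0, pad_val=0), nlp.data.batchify.Stack('int32'))
toy_ids, toy_lengths, toy_segments, toy_labels = toy_batchify(toy_samples)
# toy_ids.shape == (2, 4): the shorter sample is right-padded with zeros; lengths and labels are stacked
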
# with a single context, keep data, target and mask as one shard each
else:
xs, ys, ms = [x], [y], [m]
xs = _load(xs)
ys = _load(ys)
ms = _load(ms)
ss = [sampler(y) for y in ys]  # draw candidate classes per target shard (for the sampled softmax)
ss = _load(ss)
return xs, ys, ms, ss
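
# Hedged sketch (toy tensors; the two CPU contexts stand in for multiple GPUs) of the kind of
# splitting this helper performs: a batch is sliced along the batch axis and each slice is
# copied to one context.
import mxnet as mx
from mxnet import gluon

toy_ctxs = [mx.cpu(), mx.cpu()]
toy_batch = mx.nd.arange(24).reshape((6, 4))  # (time, batch) layout, as in BPTT batches
toy_shards = gluon.utils.split_and_load(toy_batch, toy_ctxs, batch_axis=1, even_split=True)
# len(toy_shards) == 2 and each shard has shape (6, 2)
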
train_batch_size = args.batch_size * len(context)
train_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, train_batch_size)
train_data = train_batchify(train_data_stream)
train_data = train_data.transform(_split_and_sample)
test_batch_size = args.batch_size
test_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, test_batch_size)
test_data = test_batchify(test_data_stream)
test_data = nlp.data.PrefetchingStream(test_data)
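
# Conceptual sketch in plain NumPy (not the gluonnlp API) of what BPTT batchify produces:
# a long token stream is folded into (bptt, batch_size) chunks whose targets are the same
# tokens shifted by one time step.
import numpy as np

toy_tokens = np.arange(1, 25)            # toy token-id stream
toy_bptt, toy_batch_size = 4, 3
usable = (len(toy_tokens) - 1) // (toy_bptt * toy_batch_size) * (toy_bptt * toy_batch_size)
toy_data = toy_tokens[:usable].reshape(toy_batch_size, -1).T          # (time, batch)
toy_target = toy_tokens[1:usable + 1].reshape(toy_batch_size, -1).T   # next-token targets
toy_chunks = [(toy_data[i:i + toy_bptt], toy_target[i:i + toy_bptt])
              for i in range(0, toy_data.shape[0], toy_bptt)]
# each chunk pairs a (4, 3) data matrix with its next-token targets
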
###############################################################################
# Build the model
###############################################################################
eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid,
args.nlayers, args.nproj,
embed_dropout=args.dropout,
encode_dropout=args.dropout)
model = nlp.model.language_model.train.BigRNN(ntokens, args.emsize, args.nhid,
args.nlayers, args.nproj, args.k,
embed_dropout=args.dropout,
encode_dropout=args.dropout)
loss = gluon.loss.SoftmaxCrossEntropyLoss()
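
# Minimal sketch (random toy scores) of the loss used above: SoftmaxCrossEntropyLoss takes
# unnormalised class scores and integer labels and returns one negative log-likelihood per sample.
import mxnet as mx
from mxnet import gluon

toy_loss = gluon.loss.SoftmaxCrossEntropyLoss()
toy_scores = mx.nd.random.normal(shape=(2, 5))  # (batch, num_classes or vocab)
toy_labels = mx.nd.array([1, 3])
print(toy_loss(toy_scores, toy_labels))          # shape (2,)
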
get_model_params = {
'name': args.model,
'dataset_name': args.dataset,
'pretrained': get_pretrained,
'ctx': ctx,
'use_decoder': False,
'dropout': args.dropout,
'attention_dropout': args.attention_dropout
}
# model, vocabulary and tokenizer
xlnet_base, vocab, tokenizer = model.get_model(**get_model_params)
batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Stack('int32'), # example_id
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], dtype='int32',
round_to=args.round_to), # input_ids
nlp.data.batchify.Pad(axis=0, pad_val=3, dtype='int32', round_to=args.round_to), # segment_ids
nlp.data.batchify.Stack('float32'), # valid_length
nlp.data.batchify.Pad(axis=0, pad_val=1, round_to=args.round_to), # p_mask
nlp.data.batchify.Stack('float32'), # start_position
nlp.data.batchify.Stack('float32'), # end_position
nlp.data.batchify.Stack('float32')) # is_impossible
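
# Hedged sketch (toy ids) of the round_to behaviour used in the batchify above: Pad rounds the
# padded length up to a multiple of round_to, so batches fall into a few fixed sequence lengths.
import gluonnlp as nlp

toy_pad = nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=8)
toy_padded = toy_pad([[1, 2, 3], [4, 5, 6, 7, 8]])
# toy_padded.shape == (2, 8): the longest sample has 5 tokens, rounded up to the next multiple of 8
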
if pretrained_xlnet_parameters:
# only load XLNetModel parameters
nlp.utils.load_parameters(xlnet_base, pretrained_xlnet_parameters, ctx=ctx, ignore_extra=True,
cast_dtype=True)
units = xlnet_base._net._units
net = XLNetForQA(xlnet_base=xlnet_base, start_top_n=args.start_top_n, end_top_n=args.end_top_n,
                 units=units)  # trailing argument assumed from the units computed above
elif args.bucket_scheme == 'exp':
bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
else:
raise NotImplementedError
data_lengths = get_data_lengths(data_set)
if dataset_type == 'train':
train_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
btf.Stack(dtype='float32'), btf.Stack(dtype='float32'))
else:
data_lengths = list(map(lambda x: x[-1], data_lengths))
test_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
btf.Stack(dtype='float32'), btf.Stack(dtype='float32'),
btf.Stack())
batch_sampler = nlp.data.FixedBucketSampler(lengths=data_lengths,
batch_size=(args.batch_size \
if dataset_type == 'train' \
else args.test_batch_size),
num_buckets=args.num_buckets,
ratio=args.bucket_ratio,
shuffle=(dataset_type == 'train'),
use_average_length=use_average_length,
num_shards=num_shards,
bucket_scheme=bucket_scheme)
if dataset_type == 'train':
logging.info('Train Batch Sampler:\n%s', batch_sampler.stats())
data_loader = nlp.data.ShardedDataLoader(data_set,
                                         batch_sampler=batch_sampler,
                                         batchify_fn=train_batchify_fn)  # collate fn assumed from the training branch above
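
# Self-contained sketch (random toy lengths; all names made up) of the bucketing configured above:
# FixedBucketSampler groups indices of similarly sized samples so each batch needs little padding,
# and stats() summarises the resulting buckets.
import random
import gluonnlp as nlp

toy_lengths = [random.randint(5, 60) for _ in range(200)]
toy_sampler = nlp.data.FixedBucketSampler(lengths=toy_lengths, batch_size=16, num_buckets=5,
                                          shuffle=True,
                                          bucket_scheme=nlp.data.ExpWidthBucket(bucket_len_step=1.2))
print(toy_sampler.stats())      # bucket keys, sample counts and batch sizes
for toy_batch_indices in toy_sampler:
    pass                        # each element is a list of dataset indices forming one batch
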
# TODO(sxjscience) Consider combining the NamedTuple and batchify functionality.
ChunkFeature = collections.namedtuple('ChunkFeature',
['qas_id',
'data',
'valid_length',
'segment_ids',
'masks',
'is_impossible',
'gt_start',
'gt_end',
'context_offset',
'chunk_start',
'chunk_length'])
BatchifyFunction = bf.NamedTuple(ChunkFeature,
{'qas_id': bf.List(),
'data': bf.Pad(),
'valid_length': bf.Stack(),
'segment_ids': bf.Pad(),
'masks': bf.Pad(val=1),
'is_impossible': bf.Stack(),
'gt_start': bf.Stack(),
'gt_end': bf.Stack(),
'context_offset': bf.Stack(),
'chunk_start': bf.Stack(),
'chunk_length': bf.Stack()})
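
# Conceptual sketch in plain Python/NumPy (not the exact gluonnlp API) of what the NamedTuple
# batchify above does: one collate rule per field, with the batch returned as the same
# namedtuple type so fields stay addressable by name.
import collections
import numpy as np

ToyFeature = collections.namedtuple('ToyFeature', ['qas_id', 'data', 'valid_length'])

def toy_namedtuple_batchify(samples):
    ids = [s.qas_id for s in samples]                                                 # like bf.List()
    max_len = max(len(s.data) for s in samples)
    padded = np.stack([np.pad(s.data, (0, max_len - len(s.data))) for s in samples])  # like bf.Pad()
    lengths = np.array([s.valid_length for s in samples])                             # like bf.Stack()
    return ToyFeature(qas_id=ids, data=padded, valid_length=lengths)

toy_batch = toy_namedtuple_batchify([ToyFeature('q1', np.array([1, 2, 3]), 3),
                                     ToyFeature('q2', np.array([4, 5]), 2)])
# toy_batch.data.shape == (2, 3); toy_batch.qas_id == ['q1', 'q2']
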
def __init__(self, tokenizer, doc_stride, max_seq_length, max_query_length):
    """
    Parameters
    ----------
    tokenizer
        Tokenizer used to convert raw text into sub-word ids.
    """


# Enclosing function signature inferred from the parameters used in the body below:
def get_dataloader(data_set, args, dataset_type, use_average_length=False, num_shards=0):
    """Create data loaders for training/validation/test."""
assert dataset_type in ['train', 'val', 'test']
if args.bucket_scheme == 'constant':
bucket_scheme = nlp.data.ConstWidthBucket()
elif args.bucket_scheme == 'linear':
bucket_scheme = nlp.data.LinearWidthBucket()
elif args.bucket_scheme == 'exp':
bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
else:
raise NotImplementedError
data_lengths = get_data_lengths(data_set)
if dataset_type == 'train':
train_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
btf.Stack(dtype='float32'), btf.Stack(dtype='float32'))
else:
data_lengths = list(map(lambda x: x[-1], data_lengths))
test_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
btf.Stack(dtype='float32'), btf.Stack(dtype='float32'),
btf.Stack())
batch_sampler = nlp.data.FixedBucketSampler(lengths=data_lengths,
batch_size=(args.batch_size \
if dataset_type == 'train' \
else args.test_batch_size),
num_buckets=args.num_buckets,
ratio=args.bucket_ratio,
shuffle=(dataset_type == 'train'),
use_average_length=use_average_length,
num_shards=num_shards,
bucket_scheme=bucket_scheme)
trans = partial(convert_examples_to_features, tokenizer=tokenizer,
truncate_length=truncate_length,
cls_token=vocab.cls_token if not use_roberta else vocab.bos_token,
sep_token=vocab.sep_token if not use_roberta else vocab.eos_token,
class_labels=task.class_labels, label_alias=task.label_alias, vocab=vocab)
# data train
# task.dataset_train returns (segment_name, dataset)
train_tsv = task.dataset_train()[1]
data_train = mx.gluon.data.SimpleDataset(list(map(trans, train_tsv)))
data_train_len = data_train.transform(lambda _, segment_ids, valid_length, label: valid_length,
lazy=False)
# bucket sampler for training
pad_val = vocabulary[vocabulary.padding_token]
batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to), # input
nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to), # segment
nlp.data.batchify.Stack(), # length
nlp.data.batchify.Stack(label_dtype)) # label
batch_sampler = nlp.data.sampler.FixedBucketSampler(data_train_len, batch_size=batch_size,
num_buckets=10, ratio=0, shuffle=True)
# data loader for training
loader_train = gluon.data.DataLoader(dataset=data_train, num_workers=4,
batch_sampler=batch_sampler, batchify_fn=batchify_fn)
# data dev. For MNLI, more than one dev set is available
dev_tsv = task.dataset_dev()
dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv]
loader_dev_list = []
for segment, data in dev_tsv_list:
data_dev = mx.gluon.data.SimpleDataset(list(map(trans, data)))
loader_dev = mx.gluon.data.DataLoader(data_dev, batch_size=dev_batch_size, num_workers=4,
                                      shuffle=False, batchify_fn=batchify_fn)  # trailing kwargs assumed
loader_dev_list.append((segment, loader_dev))
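
# Self-contained sketch (toy features; all names made up) of what the loaders built above yield:
# each batch arrives already padded and stacked, so a fine-tuning loop can unpack
# (input_ids, segment_ids, valid_length, label) directly.
import mxnet as mx
import gluonnlp as nlp

toy_features = [([2, 11, 7, 3], [0, 0, 0, 0], 4, 1),
                ([2, 5, 3], [0, 0, 0], 3, 0)]
toy_dataset = mx.gluon.data.SimpleDataset(toy_features)
toy_batchify = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(axis=0, pad_val=0),   # input_ids
    nlp.data.batchify.Pad(axis=0, pad_val=0),   # segment_ids
    nlp.data.batchify.Stack(),                  # valid_length
    nlp.data.batchify.Stack('int32'))           # label
toy_loader = mx.gluon.data.DataLoader(toy_dataset, batch_size=2, batchify_fn=toy_batchify)
for toy_ids, toy_segments, toy_valid_length, toy_label in toy_loader:
    print(toy_ids.shape, toy_valid_length.asnumpy(), toy_label.asnumpy())
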