word_vocab : Vocab
    Word-level vocabulary
char_vocab : Vocab
    Char-level vocabulary
"""
with open(os.path.join(self._data_root_path, self._processed_train_data_file_name),
'r') as f:
train_examples = json.load(f)
with open(os.path.join(self._data_root_path, self._processed_dev_data_file_name), 'r') as f:
dev_examples = json.load(f)
with open(os.path.join(self._data_root_path, self._word_vocab_file_name), 'r') as f:
word_vocab = Vocab.from_json(json.load(f))
with open(os.path.join(self._data_root_path, self._char_vocab_file_name), 'r') as f:
char_vocab = Vocab.from_json(json.load(f))
return train_examples, dev_examples, word_vocab, char_vocab
else:
vocab = build_vocab(train_dataset)
with open(vocab_path, 'w') as fout:
fout.write(vocab.to_json())
glove = nlp.embedding.create(args.embedding, source=args.embedding_source)
vocab.set_embedding(glove)
train_data_loader = prepare_data_loader(args, train_dataset, vocab)
val_data_loader = prepare_data_loader(args, val_dataset, vocab, test=True)
model = build_model(args, vocab)
train_model(model, train_data_loader, val_data_loader, vocab.embedding, ctx, args)
elif args.mode == 'test':
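    # Rebuild the model from the saved config and vocabulary, then evaluate on the test set.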
model_args = argparse.Namespace(**json.load(
open(os.path.join(args.model_dir, 'config.json'))))
vocab = nlp.Vocab.from_json(
open(os.path.join(args.model_dir, 'vocab.jsons')).read())
val_dataset = read_dataset(args, 'test_file')
val_data_loader = prepare_data_loader(args, val_dataset, vocab, test=True)
model = build_model(model_args, vocab)
model.load_parameters(os.path.join(
args.model_dir, 'checkpoints', 'valid_best.params'), ctx=ctx)
loss_func = gluon.loss.SoftmaxCrossEntropyLoss()
logger.info('Test on {}'.format(args.test_file))
loss, acc = test_model(model, val_data_loader, loss_func, ctx)
logger.info('loss={:.4f} acc={:.4f}'.format(loss, acc))
def compare_transformerxl(args, kwargs, corpus):
# Data
np_features, np_labels, batch_size, tgt_len = get_data(args)
# Models
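    # Reference model: pretrained Transformer-XL (wt103) from the transformers library.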
model_p = transformers.TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
model_p.crit.keep_order = True
model_p.transformer.output_attentions = False # no change of default; breaks model if changed
model_p.transformer.output_hidden_states = True
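    # Gluon model: rebuild Transformer-XL from the exported vocabulary and parameter file.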
with open(args.gluon_vocab_file, 'r') as f:
vocab = nlp.Vocab.from_json(f.read())
ctx = mx.gpu()
model = TransformerXL(vocab_size=len(vocab), clamp_len=model_p.transformer.clamp_len, **kwargs)
model.initialize(ctx=ctx)
model.load_parameters(args.gluon_parameter_file, ignore_extra=False)
model.hybridize()
# Computation
assert len(np_features) == 2
mems = model.begin_mems(batch_size, model_p.config.mem_len, context=ctx)
mems_p = None
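    # Feed the same cached batches to both models, converting the inputs to MXNet and PyTorch tensors.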
for batch in range(2):
print('Batch {}'.format(batch))
features_nd = mx.nd.array(np_features[batch], ctx=ctx)
labels_nd = mx.nd.array(np_labels[batch], ctx=ctx)
features_p = torch.tensor(np_features[batch], dtype=torch.long)
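# Load the corpus vocabulary serialized as vocab.json under the dataset path.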
def vocab(self):
path = os.path.join(self._path, 'vocab.json')
with io.open(path, 'r', encoding='utf-8') as in_file:
return Vocab.from_json(in_file.read())
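# Collect the TensorFlow reference outputs and extract the activations of the requested layers.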
tensorflow_all_out_features.append(tensorflow_out_features)
output_json['features'] = tensorflow_all_out_features
tensorflow_all_out.append(output_json)
tf_outputs = [tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes]
###############################################################################
# Gluon MODEL #
###############################################################################
if args.gluon_parameter_file:
assert args.gluon_vocab_file, \
'Must specify --gluon_vocab_file when specifying --gluon_parameter_file'
with open(args.gluon_vocab_file, 'r') as f:
vocabulary = nlp.Vocab.from_json(f.read())
bert, vocabulary = nlp.model.get_model(args.gluon_model,
dataset_name=None,
vocab=vocabulary,
pretrained=not args.gluon_parameter_file,
use_pooler=False,
use_decoder=False,
use_classifier=False)
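    # The converted checkpoint may store float16 weights; try loading in half precision
    # and fall back to float32 if the parameter dtypes do not match.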
try:
bert.cast('float16')
bert.load_parameters(args.gluon_parameter_file, ignore_extra=True)
bert.cast('float32')
except AssertionError:
bert.cast('float32')
bert.load_parameters(args.gluon_parameter_file, ignore_extra=True)
else:
    assert not args.gluon_vocab_file, \
        'Cannot specify --gluon_vocab_file without --gluon_parameter_file'
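# Persist the run configuration so that test mode can later rebuild the model with the same settings.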
json.dump(vars(args), open(os.path.join(args.output_dir, 'config.json'), 'w'))
if args.gpu_id == -1:
ctx = mx.cpu()
else:
ctx = mx.gpu(args.gpu_id)
mx.random.seed(args.seed, ctx=ctx)
if args.mode == 'train':
train_dataset = read_dataset(args, 'train_file')
val_dataset = read_dataset(args, 'test_file')
vocab_path = os.path.join(args.output_dir, 'vocab.jsons')
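    # Reuse a previously saved vocabulary if one exists; otherwise build it from the training data and cache it.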
if os.path.exists(vocab_path):
vocab = nlp.Vocab.from_json(open(vocab_path).read())
else:
vocab = build_vocab(train_dataset)
with open(vocab_path, 'w') as fout:
fout.write(vocab.to_json())
glove = nlp.embedding.create(args.embedding, source=args.embedding_source)
vocab.set_embedding(glove)
train_data_loader = prepare_data_loader(args, train_dataset, vocab)
val_data_loader = prepare_data_loader(args, val_dataset, vocab, test=True)
model = build_model(args, vocab)
train_model(model, train_data_loader, val_data_loader, vocab.embedding, ctx, args)
elif args.mode == 'test':
model_args = argparse.Namespace(**json.load(
open(os.path.join(args.model_dir, 'config.json'))))
    vocab = nlp.Vocab.from_json(
        open(os.path.join(args.model_dir, 'vocab.jsons')).read())