def test_mkdir(dirname):
    nlp.utils.mkdir(dirname)
    assert os.path.isdir(os.path.expanduser(dirname))
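# --- Illustration (not part of the original test): a minimal sketch of the
# behaviour the assertion above relies on, assuming nlp.utils.mkdir expands '~'
# and is a no-op when the directory already exists.
import os
import tempfile

def mkdir_sketch(dirname):
    """Create dirname recursively; do nothing if it already exists."""
    os.makedirs(os.path.expanduser(dirname), exist_ok=True)

target = os.path.join(tempfile.gettempdir(), 'gluonnlp_example')
mkdir_sketch(target)
assert os.path.isdir(target)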
                    type=int,
                    default=384,
                    help='The maximum total input sequence length after WordPiece tokenization. '
                         'Sequences longer than this need to be truncated, and sequences shorter '
                         'than this need to be padded. Default is 384')
parser.add_argument('--dropout',
                    type=float,
                    default=0.1,
                    help='The dropout probability for the classification/regression head.')
args = parser.parse_args()

# create output dir
output_dir = args.output_dir
nlp.utils.mkdir(output_dir)

###############################################################################
#                                   Logging                                   #
###############################################################################
log = logging.getLogger('gluonnlp')
log.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s',
                              datefmt='%H:%M:%S')
fh = logging.FileHandler(os.path.join(args.output_dir, 'hybrid_export_bert.log'), mode='w')
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(formatter)
# attach both handlers so messages reach the console and the log file
log.addHandler(console)
log.addHandler(fh)
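# --- Illustration (not part of the original script): the snippet above starts in
# the middle of a parser.add_argument call. A self-contained sketch of the full
# pattern follows; the '--max_seq_length' name is an assumption for this example.
import argparse

parser = argparse.ArgumentParser(description='toy argument parser')
parser.add_argument('--max_seq_length',
                    type=int,
                    default=384,
                    help='The maximum total input sequence length after WordPiece tokenization.')
args = parser.parse_args([])        # parse an empty list so the sketch runs as written
print(args.max_seq_length)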
# initialize classifier
if not args.model_parameters:
    model.classifier.initialize(init=initializer, ctx=ctxs)
    model.pooler.initialize(init=initializer, ctx=ctxs)

# load checkpointed parameters if provided
output_dir = args.output_dir
if args.model_parameters:
    logging.info('loading model params from %s', args.model_parameters)
    nlp.utils.load_parameters(model,
                              args.model_parameters,
                              ctx=ctxs,
                              cast_dtype=True)
nlp.utils.mkdir(output_dir)

logging.debug(model)
model.hybridize(static_alloc=True)
loss_function.hybridize(static_alloc=True)

logging.info('processing dataset...')
train_data, dev_data_list, test_data_list, num_train_examples = preprocess_data(
    tokenizer, task, args.batch_size, args.dev_batch_size, args.max_len, vocab)

def test(loader_test, segment):
    """Inference function on the test dataset."""
    logging.info('Now we are doing testing on %s with %s.', segment, ctxs)
    tic = time.time()
    results = []

# end of the training loop: persist the final checkpoint and report total time
save_parameters(step_num, model, args.ckpt_dir)
mx.nd.waitall()
train_end_time = time.time()
logging.info('Train cost={:.1f}s'.format(train_end_time - train_begin_time))
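# --- Illustration (not from the original script): a minimal, runnable sketch of
# the initialize / hybridize / save / load pattern used above, with a toy Dense
# layer standing in for the BERT-based model.
import mxnet as mx
from mxnet.gluon import nn

ctx = [mx.cpu()]
net = nn.Dense(2)
net.initialize(init=mx.init.Normal(0.02), ctx=ctx)
net.hybridize(static_alloc=True)              # cache a static computation graph
net(mx.nd.ones((1, 4), ctx=ctx[0]))           # first call triggers graph construction
net.save_parameters('toy_classifier.params')
net.load_parameters('toy_classifier.params', ctx=ctx)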
if __name__ == '__main__':
    ctx = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
          [mx.gpu(int(x)) for x in args.gpus.split(',')]

    model, nsp_loss, mlm_loss, vocab = get_model_loss(ctx, args.model, args.pretrained,
                                                      args.dataset_name, None, args.dtype,
                                                      ckpt_dir=args.ckpt_dir,
                                                      start_step=args.start_step)

    store = mx.kv.create(args.kvstore)
    nlp.utils.mkdir(args.ckpt_dir)

    if args.data:
        logging.info('Using training data at {}'.format(args.data))
        num_parts = 1 if args.dummy_data_len else store.num_workers
        part_idx = 0 if args.dummy_data_len else store.rank
        data_train = get_pretrain_data_npz(args.data, args.batch_size, len(ctx), True,
                                           args.use_avg_len, args.num_buckets,
                                           num_parts=num_parts, part_idx=part_idx,
                                           prefetch=not args.dummy_data_len)
        train(data_train, model, nsp_loss, mlm_loss, len(vocab), ctx, store)
    if args.data_eval:
        logging.info('Using evaluation data at {}'.format(args.data_eval))
        data_eval = get_pretrain_data_npz(args.data_eval, args.batch_size_eval, len(ctx),
                                          False, False, 1)
        evaluate(data_eval, model, nsp_loss, mlm_loss, len(vocab), ctx,
                 args.log_interval, args.dtype)
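# --- Illustration (not from the original script): how the num_parts / part_idx
# values derived from the kvstore are typically used to shard training files
# across workers. The round-robin slice below is an assumption for this example,
# not the exact logic inside get_pretrain_data_npz.
import mxnet as mx

store = mx.kv.create('local')         # the script passes args.kvstore, e.g. 'device' or 'dist_sync'
num_parts = store.num_workers         # 1 for a single-process store
part_idx = store.rank                 # 0 for a single-process store

shards = ['part-000.npz', 'part-001.npz', 'part-002.npz', 'part-003.npz']
my_shards = shards[part_idx::num_parts]
print(my_shards)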
                                                  dataset_name, vocab, args.dtype,
                                                  ckpt_dir=args.ckpt_dir,
                                                  start_step=args.start_step)
logging.info('Model created')
data_eval = args.data_eval

if args.raw:
    if args.sentencepiece:
        tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab,
                                             lower=not args.cased)
    else:
        tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=not args.cased)

    cache_dir = os.path.join(args.ckpt_dir, 'data_eval_cache')
    cache_file = os.path.join(cache_dir, 'part-000.npz')
    nlp.utils.mkdir(cache_dir)

    # generate dev dataset from the raw text if needed
    if not args.eval_use_npz:
        data_eval = cache_file
        if not os.path.isfile(cache_file) and rank == 0:
            generate_dev_set(tokenizer, vocab, cache_file, args)

logging.debug('Random seed set to %d', random_seed)
mx.random.seed(random_seed)

if args.data:
    if args.raw:
        get_dataset_fn = functools.partial(get_pretrain_data_text,
                                           max_seq_length=args.max_seq_length,
                                           short_seq_prob=args.short_seq_prob,
                                           masked_lm_prob=args.masked_lm_prob,
                                                  ckpt_dir=args.ckpt_dir,
                                                  start_step=args.start_step)
logging.debug('Model created')
data_eval = args.data_eval

if args.raw:
    if args.sentencepiece:
        tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab,
                                             num_best=args.sp_nbest,
                                             alpha=args.sp_alpha, lower=not args.cased)
    else:
        tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=not args.cased)

    cache_dir = os.path.join(args.ckpt_dir, 'data_eval_cache')
    cache_file = os.path.join(cache_dir, 'part-000.npz')
    nlp.utils.mkdir(cache_dir)

    # generate dev dataset from the raw text if needed
    if not args.eval_use_npz:
        data_eval = cache_file
        if not os.path.isfile(cache_file) and rank == 0:
            generate_dev_set(tokenizer, vocab, cache_file, args)

logging.debug('Random seed set to %d', random_seed)
mx.random.seed(random_seed)

if args.data:
    if args.raw:
        get_dataset_fn = functools.partial(get_pretrain_data_text,
                                           max_seq_length=args.max_seq_length,
                                           short_seq_prob=args.short_seq_prob,
                                           masked_lm_prob=args.masked_lm_prob,
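# --- Illustration (not from the original script): the two snippets above end
# mid-call, but the intent is to pre-bind dataset options with functools.partial
# so the training loop can later call get_dataset_fn(path). A generic sketch:
import functools

def load_split(path, max_seq_length, masked_lm_prob):
    return (path, max_seq_length, masked_lm_prob)   # stand-in for real data loading

get_dataset_fn = functools.partial(load_split, max_seq_length=128, masked_lm_prob=0.15)
print(get_dataset_fn('corpus-part-000.txt'))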
else:
    model = BERTClassifier(bert, dropout=0.1, num_classes=num_classes)

# initialize classifier
if not model_parameters:
    model.classifier.initialize(init=initializer, ctx=ctx)

# load checkpointed parameters if provided
output_dir = args.output_dir
if pretrained_bert_parameters:
    logging.info('loading bert params from %s', pretrained_bert_parameters)
    nlp.utils.load_parameters(model.bert, pretrained_bert_parameters, ctx=ctx,
                              ignore_extra=True, cast_dtype=True)
if model_parameters:
    logging.info('loading model params from %s', model_parameters)
    nlp.utils.load_parameters(model, model_parameters, ctx=ctx, cast_dtype=True)
nlp.utils.mkdir(output_dir)

logging.debug(model)
model.hybridize(static_alloc=True)
loss_function.hybridize(static_alloc=True)

if deploy:
    logging.info('load symbol file directly as SymbolBlock for model deployment')
    model = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix),
                                         ['data0', 'data1', 'data2'],
                                         '{}-0000.params'.format(args.model_prefix))
    model.hybridize(static_alloc=True, static_shape=True)

# data processing
do_lower_case = 'uncased' in dataset
if use_roberta:
    bert_tokenizer = nlp.data.GPT2BPETokenizer()
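# --- Illustration (not from the original script): a runnable round trip for the
# deploy branch above, exporting a toy hybridized block and reloading it with
# SymbolBlock.imports. The toy model has a single input named 'data', whereas the
# exported BERT model above expects three inputs ('data0', 'data1', 'data2').
import mxnet as mx
from mxnet.gluon import nn

net = nn.HybridSequential()
net.add(nn.Dense(16, activation='relu'), nn.Dense(2))
net.initialize()
net.hybridize(static_alloc=True, static_shape=True)
net(mx.nd.ones((1, 8)))        # run once so the traced graph can be exported
net.export('toy_model')        # writes toy_model-symbol.json and toy_model-0000.params

deployed = mx.gluon.SymbolBlock.imports('toy_model-symbol.json', ['data'],
                                        'toy_model-0000.params')
print(deployed(mx.nd.ones((1, 8))))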
def main():
    """Main function."""
    time_start = time.time()

    # random seed
    random.seed(args.random_seed)

    # create output dir
    output_dir = os.path.expanduser(args.output_dir)
    nlp.utils.mkdir(output_dir)

    # vocabulary and tokenizer
    if args.sentencepiece:
        logging.info('loading vocab file from sentence piece model: %s', args.sentencepiece)
        if args.dataset_name:
            warnings.warn('Both --dataset_name and --sentencepiece are provided. '
                          'The vocabulary will be loaded based on --sentencepiece.')
        vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)
        tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab, num_best=args.sp_nbest,
                                             alpha=args.sp_alpha, lower=not args.cased)
    else:
        logging.info('loading vocab file from pre-defined dataset: %s', args.dataset_name)
        vocab = nlp.data.utils._load_pretrained_vocab(args.dataset_name, root=output_dir,
                                                      cls=nlp.vocab.BERTVocab)
        tokenizer = BERTTokenizer(vocab=vocab, lower='uncased' in args.dataset_name)
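# --- Illustration (not from the original script): what the vocab/tokenizer pair
# built in main() is used for, shown with a tiny hand-made vocabulary instead of
# a downloaded one; the token list below is made up for the example.
import gluonnlp as nlp

counter = nlp.data.count_tokens(['hello', 'world', 'hel', '##lo'])
vocab = nlp.vocab.BERTVocab(counter)
tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=True)
print(tokenizer('Hello world'))    # wordpiece tokens drawn from the toy vocabulary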