from nntoolbox.utils import get_device
from nntoolbox.sequence.models import LanguageModel
from nntoolbox.sequence.learner import LanguageModelLearner
from nntoolbox.sequence.components import AdditiveContextEmbedding
from nntoolbox.sequence.utils import load_embedding
from torch import nn
from torch.optim import Adam
import torch
from torchtext import data
from torchtext.datasets import WikiText2
from nntoolbox.callbacks import *
from nntoolbox.metrics import *
MAX_VOCAB_SIZE = 25000
BATCH_SIZE = 16
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)
# train_iterator, val_iterator, test_iterator = WikiText2.iters()
# for tmp in train_iterator:
# print(tmp)
train_data, val_data, test_data = WikiText2.splits(TEXT)
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)  # vocab is needed before the iterator can numericalize batches
train_iterator = data.BPTTIterator(
    train_data,
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=get_device(),
    bptt_len=35,
    shuffle=True
)
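# A minimal consumption sketch (not from the original script): a tiny LSTM stands
# in for the project's LanguageModel/LanguageModelLearner. Each BPTT batch exposes
# `.text` and `.target` tensors of shape (bptt_len, batch_size).
class ToyLM(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, tokens):
        hidden, _ = self.rnn(self.embed(tokens))
        return self.out(hidden)


toy_model = ToyLM(len(TEXT.vocab)).to(get_device())
criterion = nn.CrossEntropyLoss()
optimizer = Adam(toy_model.parameters())
for batch in train_iterator:
    optimizer.zero_grad()
    logits = toy_model(batch.text)                          # (bptt_len, batch, vocab)
    loss = criterion(logits.reshape(-1, logits.size(-1)),   # flatten time and batch dims
                     batch.target.reshape(-1))
    loss.backward()
    optimizer.step()
    break  # one step only: this is an illustration, not the real training loop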
print("WARNING: You have CUDA but not using it.")
if torch.cuda.is_available() and args.cuda:
torch.cuda.set_device(args.gpu)
torch.cuda.manual_seed(args.seed)
if not args.trained_model:
print("ERROR: You need to provide a option 'trained_model' path to load the model.")
sys.exit(1)
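# For reference, a hedged sketch of the command-line flags the checks above assume;
# the names mirror the attributes used (`cuda`, `gpu`, `seed`, `trained_model`),
# but the defaults and help strings are guesses, not the original parser.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--cuda', action='store_true', help='run on GPU if available')
parser.add_argument('--gpu', type=int, default=0, help='CUDA device index')
parser.add_argument('--seed', type=int, default=1234, help='random seed')
parser.add_argument('--trained_model', type=str, default='',
                    help='path to a saved model checkpoint')
args = parser.parse_args([])  # parse defaults only, for illustration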
# ---- get the Field, Dataset, Iterator for train/dev/test sets -----
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()


def tokenize_text():
    return lambda text: tokenizer.tokenize(text)


questions = data.Field(lower=True, tokenize=tokenize_text())
relations = data.Field(sequential=False)
train, dev, test = SimpleQaRelationDataset.splits(questions, relations)
train_iter, dev_iter, test_iter = SimpleQaRelationDataset.iters(args, questions, relations, train, dev, test, shuffleTrain=False)
# load the model
config = args
config.n_embed = len(questions.vocab) # vocab. size / number of embeddings
config.d_out = len(relations.vocab)
config.n_cells = config.n_layers
# double the number of cells for bidirectional networks
if config.birnn:
    config.n_cells *= 2
print(config)
model = RelationClassifier(config)
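# A hedged sketch (not from the original script): if the questions vocabulary was
# built with pretrained vectors, e.g. questions.build_vocab(train, dev, test,
# vectors="glove.6B.300d"), they can be copied into the classifier's embedding
# layer. `model.embed` is an assumed attribute name, not the project's API.
if getattr(questions.vocab, "vectors", None) is not None and hasattr(model, "embed"):
    model.embed.weight.data.copy_(questions.vocab.vectors)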
# --- Fragment: the tail of a BucketIterator.splits(...) call from an NMT script;
#     the datasets and the assignment were not part of this excerpt. A plausible
#     full call (dataset names assumed) would be:
# train_iter, val_iter = data.BucketIterator.splits(
#     (train_dataset, valid_dataset), batch_size=32,
#     sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)),
#     repeat=False, shuffle=True, device=device)
# Reconstructed class and signature (this excerpt began mid-parameter-list): the
# parameter names follow the docstring below; the default values are assumptions.
class DataLoader(object):

    def __init__(
        self,
        train_fn,
        valid_fn,
        batch_size=64,
        device=-1,
        max_vocab=999999,
        min_freq=1,
        use_eos=False,
        shuffle=True,
    ):
        '''
        DataLoader initialization.
        :param train_fn: Train-set filename
        :param valid_fn: Validation-set filename
        :param batch_size: Batchify data for a certain batch size.
        :param device: Device-id to load data (-1 for CPU)
        :param max_vocab: Maximum vocabulary size
        :param min_freq: Minimum frequency for a loaded word.
        :param use_eos: If True, put <EOS> after every end of sentence.
        :param shuffle: If True, randomly shuffle the input data.
        '''
        super().__init__()

        # Define the fields of the input file.
        # The input file consists of two fields.
        self.label = data.Field(
            sequential=False,
            use_vocab=True,
            unk_token=None
        )
        self.text = data.Field(
            use_vocab=True,
            batch_first=True,
            include_lengths=False,
            eos_token='<EOS>' if use_eos else None  # '<EOS>' marker restored; the literal was missing from the excerpt
        )

        # The two columns are delimited by TAB, so we use TabularDataset to load
        # them from the two separate input files (train_fn, valid_fn), each
        # consisting of a label field and a text field.
        # (The keyword arguments below are reconstructed; only the first line of
        #  this call was part of the excerpt.)
        train, valid = data.TabularDataset(
            path=train_fn,
            format='tsv',
            fields=[('label', self.label), ('text', self.text)],
        ), data.TabularDataset(
            path=valid_fn,
            format='tsv',
            fields=[('label', self.label), ('text', self.text)],
        )
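        # A hedged sketch of what typically follows in a loader like this (the
        # remainder of __init__ was not part of the excerpt): build bucketed
        # iterators over the two datasets and the vocabularies for both fields.
        self.train_iter, self.valid_iter = data.BucketIterator.splits(
            (train, valid),
            batch_size=batch_size,
            device='cuda:%d' % device if device >= 0 else 'cpu',
            shuffle=shuffle,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
        )
        self.label.build_vocab(train)
        self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq)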
# --- Example-construction helper from a dataset-builder class (the enclosing
#     class is not part of this excerpt) ---
import torchtext


def _construct_example_fromlist(self, data, fields):
    """
    Args:
        data: the values to be set as attributes of the to-be-created
            `Example`, in the same order as `fields`.
        fields: a list of (name, `torchtext.data.Field`) pairs; the names
            become attributes of the to-be-created `Example`.
    Returns:
        the created `Example` object.
    """
    ex = torchtext.data.Example()
    for (name, field), val in zip(fields, data):
        if field is not None:
            setattr(ex, name, field.preprocess(val))
        else:
            setattr(ex, name, val)
    return ex
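# A hedged usage sketch for the helper above; the field and value names are made
# up. Because the method does not actually use `self`, it can be exercised standalone.
src_field = torchtext.data.Field(tokenize=str.split)
example_fields = [('src', src_field), ('score', None)]
ex = _construct_example_fromlist(None, ['a small test sentence', 0.5], example_fields)
print(ex.src)    # ['a', 'small', 'test', 'sentence'] (Field.preprocess tokenized it)
print(ex.score)  # 0.5 (no Field attached, so the value is stored as-is)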
# --- Dynamic batch-size functions for an NMT training script (`args` and the
#     datasets below are assumed to be defined by the surrounding code) ---
def dyn_batch_without_padding(new, i, sofar):
    if args.distillation:
        return sofar + max(len(new.src), len(new.trg), len(new.dec))
    else:
        return sofar + max(len(new.src), len(new.trg))
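# dyn_batch_with_padding is referenced below but not shown in this excerpt; a
# plausible sketch (an assumption, not the project's definition) estimates the
# padded token count: examples-so-far times the longest source/target length.
def dyn_batch_with_padding(new, i, sofar):
    global _max_src_len, _max_trg_len
    if i == 1:
        _max_src_len, _max_trg_len = 0, 0
    _max_src_len = max(_max_src_len, len(new.src))
    _max_trg_len = max(_max_trg_len, len(new.trg))
    return i * max(_max_src_len, _max_trg_len)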
if args.batch_size == 1:  # speed-test: one sentence per batch.
    batch_size_fn = lambda new, count, sofar: count
else:
    batch_size_fn = dyn_batch_with_padding  # alternative: dyn_batch_without_padding
train_real, dev_real = data.BucketIterator.splits(
    (train_data, dev_data), batch_sizes=(args.batch_size, args.valid_batch_size),
    device=args.gpu, shuffle=False, batch_size_fn=batch_size_fn,
    repeat=None if args.mode == 'train' else False)
aux_reals = [data.BucketIterator(dataset, batch_size=args.batch_size, device=args.gpu,
                                 train=True, batch_size_fn=batch_size_fn, shuffle=False)
             for dataset in aux_data]
logger.info("build the dataset. done!")
# ----------------------------------------------------------------------------------------------------------------- #
# model hyper-params:
logger.info('use default parameters of t2t-base')
hparams = {'d_model': 512, 'd_hidden': 512, 'n_layers': 6,
           'n_heads': 8, 'drop_ratio': 0.1, 'warmup': 16000}  # ~32
args.__dict__.update(hparams)
# ----------------------------------------------------------------------------------------------------------------- #
# show the args:
# hp_str = (f"{args.dataset}_subword_"
# f"{args.d_model}_{args.d_hidden}_{args.n_layers}_{args.n_heads}_"
# --- Checkpoint loading / dataset preparation from a seq2seq sample script.
#     `opt` is an argparse namespace; SourceField, TargetField and Checkpoint
#     come from the surrounding project and are assumed to be imported. ---
import os

if opt.load_checkpoint is not None:  # condition reconstructed: the excerpt began mid-call
    checkpoint_path = os.path.join(
        opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint
    )
    checkpoint = Checkpoint.load(checkpoint_path)
    seq2seq = checkpoint.model
    input_vocab = checkpoint.input_vocab
    output_vocab = checkpoint.output_vocab
else:
    # Prepare dataset
    src = SourceField()
    tgt = TargetField()
    max_len = 50

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(
        path=opt.train_path,
        format="tsv",
        fields=[("src", src), ("tgt", tgt)],
        filter_pred=len_filter,
    )
    dev = torchtext.data.TabularDataset(
        path=opt.dev_path,
        format="tsv",
        fields=[("src", src), ("tgt", tgt)],
        filter_pred=len_filter,
    )

    src.build_vocab(train, max_size=50000)
    tgt.build_vocab(train, max_size=50000)
    input_vocab = src.vocab
    output_vocab = tgt.vocab
logger.info(f"Average sentence length: {(lengths / counter):0.3f}")
train_err = 0.0
train_total = 0.0
start_time = time.time()
self.model.train()
# Each time we will clear and reload the train_instances_cache
instances = self.train_instances_cache
random.shuffle(self.train_instances_cache)
data_iterator = torchtext.data.iterator.pool(
    instances, self.config_data.batch_size_tokens,
    key=lambda x: x.length(),  # length of word_ids
    batch_size_fn=batch_size_fn,
    random_shuffler=torchtext.data.iterator.RandomShuffler())
step = 0
for batch in data_iterator:
    step += 1
    batch_data = self.get_batch_tensor(batch, device=self.device)
    word, char, labels, masks, lengths = batch_data

    self.optim.zero_grad()
    loss = self.model(word, char, labels, mask=masks)
    loss.backward()
    self.optim.step()

    num_inst = word.size(0)
    train_err += loss.item() * num_inst
    train_total += num_inst
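# The batch_size_fn passed to pool() above is not part of this excerpt; a hedged
# sketch of a token-count version (an assumption, not the project's definition):
def batch_size_fn(new, count, sofar):
    # Accumulate instance lengths so each batch holds roughly batch_size_tokens tokens.
    return sofar + new.length()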
    return tgt_mask  # (tail of a masking helper; its definition is not part of this excerpt)


# --- Token-count batching and a length-bucketing custom Iterator ---
global max_src_in_batch, max_tgt_in_batch


def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)
class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                # Sort within a large pool (100 batches' worth), batch by token
                # count, then shuffle the batches so each batch holds sequences
                # of similar length.
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))
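# A hedged usage sketch (the dataset objects and device are assumptions, so the
# calls are left commented): MyIterator pairs with batch_size_fn above so that
# batch_size counts tokens rather than examples.
# train_iter = MyIterator(train_dataset, batch_size=12000, device=torch.device('cuda'),
#                         repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
#                         batch_size_fn=batch_size_fn, train=True, shuffle=True)
# valid_iter = MyIterator(valid_dataset, batch_size=12000, device=torch.device('cuda'),
#                         repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
#                         batch_size_fn=None, train=False, sort=False)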
# --- __init__ of a length-bucketed sentence dataset class. The class statement
#     and the first parameters were not part of this excerpt, so the header below
#     is reconstructed; the special-token defaults are conventional markers
#     restored as an assumption (the literals were missing from the excerpt). ---
import numpy


def __init__(self,
             sentences,
             min_freq: int,
             device: int,
             pad_token='<pad>',
             unk_token='<unk>',
             bos_token='<bos>',
             eos_token='<eos>',
             seed=777):
    numpy.random.seed(seed)
    self.sent_dict = self._gathered_by_lengths(sentences)
    self.pad_token = pad_token
    self.unk_token = unk_token
    self.bos_token = bos_token
    self.eos_token = eos_token
    self.device = device

    self.sentence_field = data.Field(use_vocab=True,
                                     unk_token=self.unk_token,
                                     pad_token=self.pad_token,
                                     init_token=self.bos_token,
                                     eos_token=self.eos_token,
                                     batch_first=True,
                                     include_lengths=False)
    self.sentence_id_field = data.Field(use_vocab=False, batch_first=True)

    self.sentence_field.build_vocab(sentences, min_freq=min_freq)
    self.vocab = self.sentence_field.vocab
    if self.pad_token:
        self.pad_index = self.sentence_field.vocab.stoi[self.pad_token]

    self.dataset = self._create_dataset(self.sent_dict, sentences)
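# A hedged, self-contained illustration of what a Field configured like the one
# above does (separate from the class; all names here are local to this sketch):
demo_field = data.Field(init_token='<bos>', eos_token='<eos>', batch_first=True)
demo_sents = [['hello', 'world'], ['hello', 'there', 'again']]
demo_field.build_vocab(demo_sents)
demo_batch = demo_field.process(demo_sents)  # pads, adds <bos>/<eos>, numericalizes
print(demo_batch.shape)  # torch.Size([2, 5]): 3 tokens plus <bos>/<eos>, padded to the longest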