def udpos_dataset(batch_size):
    # Set up fields with the batch dimension first
    inputs = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True)
    tags = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True)
    # Download and load the default data
    train, val, test = datasets.UDPOS.splits(
        fields=(('inputs_word', inputs), ('labels', tags), (None, None)))
    # Build vocab (the attribute names must match the field names given above)
    inputs.build_vocab(train.inputs_word)
    tags.build_vocab(train.labels)
    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), batch_size=batch_size,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    train_iter.repeat = False
    return train_iter, val_iter, test_iter, inputs, tags
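# A minimal usage sketch (an assumption, not part of the original snippet): each batch
# exposes the attributes named in `fields` above, i.e. `inputs_word` and `labels`.
train_iter, val_iter, test_iter, inputs, tags = udpos_dataset(batch_size=32)
for batch in train_iter:
    words = batch.inputs_word   # LongTensor of shape (batch, seq_len) because batch_first=True
    labels = batch.labels       # LongTensor of the same shape holding tag indices
    break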
print('Loading dataset...')
train_data, test_data = KCDataset.splits(TEXT, LABEL, root='../data')
train_data, valid_data = train_data.split(random_state=random.seed(SEED))
# Build the vocab
print('Building vocab...')
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)
# And create the iterators.
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)
# Create an instance of the model; the `FastText` variant is left commented out and the `CNN` class is used instead.
INPUT_DIM = len(TEXT.vocab)
# model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM)
model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS,
            FILTER_SIZES, OUTPUT_DIM, DROPOUT)
# Train the Model
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
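# A minimal training-loop sketch (an assumption, not from the original snippet): it presumes
# `batch.text` is a plain LongTensor, `batch.label` holds 0/1 targets, and the model returns
# one logit per example, which is what BCEWithLogitsLoss expects.
model = model.to(device)
criterion = criterion.to(device)
model.train()
for batch in train_iterator:
    optimizer.zero_grad()
    predictions = model(batch.text).squeeze(1)        # (batch,) raw logits
    loss = criterion(predictions, batch.label.float())
    loss.backward()
    optimizer.step()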
# load the IMDB data
if arg.final:
    train, test = datasets.IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train, max_size=arg.vocab_size - 2)
    LABEL.build_vocab(train)
    train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=arg.batch_size, device=util.d())
else:
    tdata, _ = datasets.IMDB.splits(TEXT, LABEL)
    train, test = tdata.split(split_ratio=0.8)
    TEXT.build_vocab(train, max_size=arg.vocab_size - 2)  # - 2 to make space for <unk> and <pad>
    LABEL.build_vocab(train)
    train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=arg.batch_size, device=util.d())
print(f'- nr. of training batches {len(train_iter)}')
print(f'- nr. of {"test" if arg.final else "validation"} batches {len(test_iter)}')
if arg.max_length < 0:
    mx = max([input.text[0].size(1) for input in train_iter])
    mx = mx * 2
    print(f'- maximum sequence length: {mx}')
else:
    mx = arg.max_length
# create the model
model = former.CTransformer(emb=arg.embedding_size, heads=arg.num_heads, depth=arg.depth,
                            seq_length=mx, num_tokens=arg.vocab_size, num_classes=NUM_CLS,
                            max_pool=arg.max_pool)
if torch.cuda.is_available():
    model.cuda()
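# A minimal training-step sketch (an assumption, not from the original snippet): it presumes a
# learning-rate option `arg.lr`, that `batch.text` is a (tensor, lengths) pair because the TEXT
# field was built with include_lengths=True, and that the model returns log-probabilities
# (otherwise use F.cross_entropy on raw logits instead of F.nll_loss).
import torch.nn.functional as F

opt_adam = torch.optim.Adam(model.parameters(), lr=arg.lr)
for batch in train_iter:
    opt_adam.zero_grad()
    inp = batch.text[0]              # (batch, seq_len) token indices
    if inp.size(1) > mx:             # clip sequences longer than the model's max length
        inp = inp[:, :mx]
    out = model(inp)                 # (batch, NUM_CLS) log-probabilities
    loss = F.nll_loss(out, batch.label - 1)  # shift 1-based label indices to 0-based (assumption)
    loss.backward()
    opt_adam.step()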
if opt.dataset=="imdb":
    train, test = datasets.IMDB.splits(TEXT, LABEL)
elif opt.dataset=="sst":
    train, val, test = datasets.SST.splits(TEXT, LABEL, fine_grained=True, train_subtrees=True,
                                           filter_pred=lambda ex: ex.label != 'neutral')
elif opt.dataset=="trec":
    train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True)
else:
    print("does not support this dataset")
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)
# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())
train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=opt.batch_size,
                                                   device=device, repeat=False, shuffle=True)
opt.label_size = len(LABEL.vocab)
opt.vocab_size = len(TEXT.vocab)
opt.embedding_dim = TEXT.vocab.vectors.size()[1]
opt.embeddings = TEXT.vocab.vectors
return train_iter, test_iter
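# A small illustrative sketch (an assumption, not from the original snippet): the `opt`
# fields filled above are typically consumed like this when the model is built.
import torch.nn as nn

embedding = nn.Embedding(opt.vocab_size, opt.embedding_dim)
embedding.weight.data.copy_(opt.embeddings)   # initialise with the GloVe vectors loaded above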
def sst(text_field, label_field, **kargs):
    print("SST")
    train_data, dev_data, test_data = sstdatasets.SST.splits(text_field, label_field, fine_grained=True)
    print("len(train_data) {} ".format(len(train_data)))
    text_field.build_vocab(train_data, dev_data, test_data)
    label_field.build_vocab(train_data, dev_data, test_data)
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train_data, dev_data, test_data),
        batch_sizes=(args.batch_size,
                     len(dev_data),
                     len(test_data)),
        **kargs)
    return train_iter, dev_iter, test_iter
url = re.compile('(<url>.*</url>)')

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

DE = Field(tokenize=tokenize_de, include_lengths=True,
           init_token='<sos>', eos_token='<eos>')
EN = Field(tokenize=tokenize_en, include_lengths=True,
           init_token='<sos>', eos_token='<eos>')
train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))
DE.build_vocab(train.src, min_freq=2)
EN.build_vocab(train.trg, max_size=10000)
train_iter, val_iter, test_iter = BucketIterator.splits(
    (train, val, test), batch_size=batch_size, repeat=False)
return train_iter, val_iter, test_iter, DE, EN
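# A minimal usage sketch (an assumption, not from the original snippet): the block above ends
# with a `return`, so it presumably lives inside a helper such as `load_dataset(batch_size)`.
train_iter, val_iter, test_iter, DE, EN = load_dataset(batch_size=32)
for batch in train_iter:
    src, src_lengths = batch.src   # include_lengths=True makes each field a (tensor, lengths) pair
    trg, trg_lengths = batch.trg
    break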
def sst(text_field, label_field, **kargs):
    train_data, dev_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True)
    text_field.build_vocab(train_data, dev_data, test_data)
    label_field.build_vocab(train_data, dev_data, test_data)
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train_data, dev_data, test_data),
        batch_sizes=(args.batch_size,
                     len(dev_data),
                     len(test_data)),
        **kargs)
    return train_iter, dev_iter, test_iter
filter_pred = None
if not args.fine_grained:
    filter_pred = lambda ex: ex.label != 'neutral'
dataset_splits = datasets.SST.splits(
    root=args.datadir, text_field=text_field, label_field=label_field,
    fine_grained=args.fine_grained, train_subtrees=True,
    filter_pred=filter_pred)
test_dataset = dataset_splits[2]
text_field.build_vocab(*dataset_splits)
label_field.build_vocab(*dataset_splits)
text_field.vocab.id_to_word = lambda i: text_field.vocab.itos[i]
text_field.vocab.id_to_tf = lambda i: text_field.vocab.freqs[text_field.vocab.itos[i]]
print(f'Number of classes: {len(label_field.vocab)}')
_, _, test_loader = data.BucketIterator.splits(
    datasets=dataset_splits, batch_size=args.batch_size, device=args.gpu)
num_classes = len(label_field.vocab)
model = SSTModel(
    typ='RL-SA',
    vocab=text_field.vocab,
    num_classes=num_classes, num_words=len(text_field.vocab),
    word_dim=args.word_dim, hidden_dim=args.hidden_dim,
    clf_hidden_dim=args.clf_hidden_dim,
    clf_num_layers=args.clf_num_layers,
    use_leaf_rnn=args.leaf_rnn,
    use_batchnorm=args.batchnorm,
    dropout_prob=args.dropout,
    bidirectional=args.bidirectional,
    cell_type=args.cell_type,
    att_type=args.att_type)
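# A minimal evaluation sketch (an assumption, not from the original snippet): it presumes the
# model maps a batch of token indices to class logits and that the batch attributes follow the
# usual torchtext names `text` and `label`.
model.eval()
correct = total = 0
with torch.no_grad():
    for batch in test_loader:
        logits = model(batch.text)
        preds = logits.argmax(dim=-1)
        correct += (preds == batch.label).sum().item()
        total += batch.label.size(0)
print(f'test accuracy: {correct / total:.4f}')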
def dyn_batch_with_padding(new, i, sofar):
    # batch cost when sequences are padded: (longest sequence so far) * (nr. of sentences)
    prev_max_len = sofar / (i - 1) if i > 1 else 0
    if args.distillation:
        return max(len(new.src), len(new.trg), len(new.dec), prev_max_len) * i
    else:
        return max(len(new.src), len(new.trg), prev_max_len) * i
def dyn_batch_without_padding(new, i, sofar):
    # batch cost without padding: running sum of each sentence pair's longer side
    if args.distillation:
        return sofar + max(len(new.src), len(new.trg), len(new.dec))
    else:
        return sofar + max(len(new.src), len(new.trg))
if args.batch_size == 1:  # speed-test: one sentence per batch.
    batch_size_fn = lambda new, count, sofar: count
else:
    batch_size_fn = dyn_batch_with_padding  # dyn_batch_without_padding
train_real, dev_real = data.BucketIterator.splits(
    (train_data, dev_data), batch_sizes=(args.batch_size, args.valid_batch_size),
    device=args.gpu, shuffle=False, batch_size_fn=batch_size_fn,
    repeat=None if args.mode == 'train' else False)
aux_reals = [data.BucketIterator(dataset, batch_size=args.batch_size, device=args.gpu, train=True,
                                 batch_size_fn=batch_size_fn, shuffle=False)
             for dataset in aux_data]
logger.info("build the dataset. done!")
# ----------------------------------------------------------------------------------------------------------------- #
# model hyper-params:
logger.info('use default parameters of t2t-base')
hparams = {'d_model': 512, 'd_hidden': 512, 'n_layers': 6,
           'n_heads': 8, 'drop_ratio': 0.1, 'warmup': 16000}  # ~32
args.__dict__.update(hparams)
# ----------------------------------------------------------------------------------------------------------------- #
# show the arg:
# 2. data.TabularDataset
train_data, valid_data, test_data = data.TabularDataset.splits(
    path=dataset_path,
    train="train.csv",
    validation="valid.csv",
    test="test.csv",
    fields=[('text', TEXT), ('rating', RATING_LABEL), ('gender', GENDER_LABEL),
            ('age', AGE_LABEL), ('location', LOCALTION_LABEL)],
    format="csv")
print("Number of train_data = {}".format(len(train_data)))
print("Number of valid_data = {}".format(len(valid_data)))
print("Number of test_data = {}".format(len(test_data)))
print("vars(train_data[0]) = {}\n".format(vars(train_data[0])))
# 3. data.BucketIterator
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda x: len(x.text))
# 4. Build vocab
# TEXT.build_vocab(train_data,
#                  unk_init=torch.Tensor.normal_)
# LABELS.build_vocab(train_data)
# print("vars(train_data[0]) = ", vars(train_data[0]))
# 4.1 (Optional) Build the vocab with pre-trained word embedding vectors
TEXT.build_vocab(train_data, vectors="glove.6B.100d")
RATING_LABEL.build_vocab(train_data)
GENDER_LABEL.build_vocab(train_data)
AGE_LABEL.build_vocab(train_data)
LOCALTION_LABEL.build_vocab(train_data)
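# A minimal usage sketch (an assumption, not from the original snippet): each field name given
# to TabularDataset above becomes an attribute on the batches produced by the iterators.
for batch in train_iter:
    text = batch.text          # token indices; shape depends on how TEXT was defined (seq-first by default)
    rating = batch.rating      # indices into RATING_LABEL.vocab
    gender = batch.gender
    age = batch.age
    location = batch.location
    break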