from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe, CharNGram, FastText
# Approach 1:
# set up fields
TEXT = data.Field()
LABEL = data.Field(sequential=False)
# make splits for data
train, val, test = datasets.SST.splits(
    TEXT, LABEL, fine_grained=True, train_subtrees=True,
    filter_pred=lambda ex: ex.label != 'neutral')
# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))
# build the vocabulary
url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec'
TEXT.build_vocab(train, vectors=Vectors('wiki.simple.vec', url=url))
LABEL.build_vocab(train)
# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())
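# Example follow-up (a sketch, not part of the snippet above): once build_vocab
# has attached the pretrained vectors, they are typically copied into an
# nn.Embedding layer and the splits wrapped in iterators. The batch size is
# illustrative.
import torch
import torch.nn as nn
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=64,
    device=0 if torch.cuda.is_available() else -1)
embedding = nn.Embedding(len(TEXT.vocab), TEXT.vocab.vectors.size(1))
embedding.weight.data.copy_(TEXT.vocab.vectors)  # initialize from pretrained vectors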
def evaluate(args):
    lstm_hidden_dims = [int(d) for d in args.lstm_hidden_dims.split(',')]
    logging.info('Loading data...')
    text_field = data.Field(lower=True, include_lengths=True,
                            batch_first=False)
    label_field = data.Field(sequential=False)
    if not os.path.exists(args.data_dir):
        os.makedirs(args.data_dir)
    dataset_splits = datasets.SNLI.splits(
        text_field=text_field, label_field=label_field, root=args.data_dir)
    test_dataset = dataset_splits[2]
    text_field.build_vocab(*dataset_splits)
    label_field.build_vocab(*dataset_splits)
    _, _, test_loader = data.BucketIterator.splits(
        datasets=dataset_splits, batch_size=args.batch_size, device=args.gpu)
    logging.info('Building model...')
    num_classes = len(label_field.vocab)
    num_words = len(text_field.vocab)
    model = NLIModel(num_words=num_words, word_dim=args.word_dim,
                     lstm_hidden_dims=lstm_hidden_dims,
                     mlp_hidden_dim=args.mlp_hidden_dim,
                     mlp_num_layers=args.mlp_num_layers,
                     num_classes=num_classes, dropout_prob=0)
    model.load_state_dict(torch.load(args.model_path))
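    # Example evaluation loop (a sketch; NLIModel's forward signature is not
    # shown above, so the positional call below is an assumption). Because the
    # field was built with include_lengths=True, batch.premise and
    # batch.hypothesis are (tokens, lengths) pairs.
    model.eval()
    num_correct = 0
    with torch.no_grad():
        for batch in test_loader:
            premise, premise_length = batch.premise
            hypothesis, hypothesis_length = batch.hypothesis
            logits = model(premise, premise_length, hypothesis, hypothesis_length)
            num_correct += (logits.max(dim=1)[1] == batch.label).sum().item()
    logging.info('Test accuracy: %.4f', num_correct / len(test_dataset))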
def get_mt_datasets(exts, fields, train_path, val_path, test_path=""):
    train = datasets.TranslationDataset(
        path=train_path, exts=exts, fields=fields)
    val = datasets.TranslationDataset(path=val_path, exts=exts, fields=fields)
    return train, val, None
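# Example usage (a sketch; the field setup and file prefixes below are
# illustrative, not taken from the snippet above):
from torchtext import data
SRC = data.Field()
TGT = data.Field(init_token='<s>', eos_token='</s>')
mt_train, mt_val, _ = get_mt_datasets(
    exts=('.src', '.trg'), fields=(SRC, TGT),
    train_path='data/train', val_path='data/valid')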
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]
TEXT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD,
                  eos_token=EOS_WORD, pad_token=BLANK_WORD)
###############
# Vocabulary  #
###############
if os.path.exists(vocab_file):
    print("Loading saved vocabulary...")
    TEXT.vocab = torch.load(vocab_file)
else:
    print("Loading data...")
    train = datasets.TranslationDataset(
        path=os.path.join(src_dir, DATA),
        exts=('.train.src', '.train.trg'), fields=(TEXT, TEXT))
    MIN_FREQ = 2
    TEXT.build_vocab(train.src, min_freq=MIN_FREQ)
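    # Cache the freshly built vocab so the os.path.exists(vocab_file) branch
    # above can reuse it on the next run (a sketch; this step is assumed, not
    # shown in the snippet):
    torch.save(TEXT.vocab, vocab_file)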
#########################
# Save in count order #
#########################
ordered_words = [word for word, _ in TEXT.vocab.freqs.most_common()]
with open(vocab_freq_file, 'w') as f:
    print('Writing...')
    f.write('<s>\n</s>\n\n')
    for word in ordered_words:
        f.write(word + '\n')
# per-example (unbatched) variant: accumulate the loss over individual pairs
loss = 0
for src, trg in zip(batch.src.examples(), batch.trg.examples()):
    logits = self(src, trg[:, :-1])
    loss += F.cross_entropy(logits, trg[:, 1:], reduce=reduce)
return loss
# batched variant: a single forward pass over the whole batch
logits = self(batch.src, batch.trg[:, :-1])
return F.cross_entropy(logits, batch.trg[:, 1:], reduce=reduce)
if __name__ == '__main__':
    import sys
    unbatch = sys.argv[1] == '1'
    small = sys.argv[2] == '1'
    if sys.argv[3] == '1':
        TEXT = data.Field(batch_first=True)
    else:
        TEXT = MaskedBatchField(batch_first=True)
    train, dev, test = datasets.IWSLT.splits(('.de', '.en'), (TEXT, TEXT))
    TEXT.build_vocab(train, max_size=50000)
    random.seed(0)
    torch.manual_seed(0)
    train_iter = data.BucketIterator(
        train, batch_size=32, device=0 if torch.cuda.is_available() else -1)
    args = argparse.Namespace()
    args.__dict__.update(d_model=8 if small else 512,
                         d_hidden=1 if small else 2048,
                         n_heads=8, drop_ratio=0,
                         n_layers=6, length_ratio=1.5)
    model = Transformer(TEXT, TEXT, args)
    if torch.cuda.is_available():
        model.cuda()
    for i, b in enumerate(train_iter):
        if i == 1:
            t = time.time()
        if i == 2:
            pass  # the remainder of the timing loop is not included in this excerpt
def loadData(opt):
    if not opt.from_torchtext:
        import dataHelper as helper
        return helper.loadData(opt)
    device = 0 if torch.cuda.is_available() else -1
    TEXT = data.Field(lower=True, include_lengths=True, batch_first=True,
                      fix_length=opt.max_seq_len)
    LABEL = data.Field(sequential=False)
    if opt.dataset == "imdb":
        train, test = datasets.IMDB.splits(TEXT, LABEL)
    elif opt.dataset == "sst":
        train, val, test = datasets.SST.splits(
            TEXT, LABEL, fine_grained=True, train_subtrees=True,
            filter_pred=lambda ex: ex.label != 'neutral')
    elif opt.dataset == "trec":
        train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True)
    else:
        print("does not support this dataset")
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train)
    # print vocab information
    print('len(TEXT.vocab)', len(TEXT.vocab))
    print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())
    train_iter, test_iter = data.BucketIterator.splits(
        (train, test), batch_size=opt.batch_size, device=device,
        repeat=False, shuffle=True)
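    # Hand the iterators back to the caller (a sketch; the function's actual
    # return value is not shown in the snippet above):
    return train_iter, test_iter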
#####################
#   Data Loading    #
#####################
BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = ""
MIN_FREQ = 2
spacy_en = spacy.load('en')
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]
TEXT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD,
                  eos_token=EOS_WORD, pad_token=BLANK_WORD)
train = datasets.TranslationDataset(path=os.path.join(SRC_DIR, DATA),
    exts=('.train.src', '.train.trg'), fields=(TEXT, TEXT))
val = datasets.TranslationDataset(path=os.path.join(SRC_DIR, DATA),
    exts=('.val.src', '.val.trg'), fields=(TEXT, TEXT))
train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=device,
                        repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                        batch_size_fn=batch_size_fn, train=True)
valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=device,
                        repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                        batch_size_fn=batch_size_fn, train=False)
random_idx = random.randint(0, len(train) - 1)
print(train[random_idx].src)
print(train[random_idx].trg)
###############
# Vocabulary  #
###############
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe, CharNGram
import torch
if not torch.cuda.is_available() :
device = -1
else:
device = 0
# Approach 1:
# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)
# make splits for data
train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True)
# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))
# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)
# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())
# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_size=64, device=device)  # batch size here is illustrative
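# Consuming one batch from the iterator (a sketch, not part of the original
# example). Because include_lengths=True, batch.text is a (token_ids, lengths)
# pair; batch.label holds the TREC class indices.
batch = next(iter(train_iter))
tokens, lengths = batch.text
print(tokens.size(), lengths.size(), batch.label.size())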
print(' initialize with pretrained vectors: %s' % config['pretrained'])
print(' number of classes: %d' % num_classes)
print(' number of tokens: %d' % num_tokens)
print(' max batch size: %d' % config['batch_size'])
# sort_within_batch is set True because we may use nn.LSTM
train_loader, valid_loader = data.BucketIterator.splits(
    datasets=dataset_splits, batch_size=config['batch_size'],
    sort_within_batch=True, device=config['gpu'])
elif config['dataset'] == 'SST':
    filter_pred = None
    if not config['fine_grained']:
        filter_pred = lambda ex: ex.label != 'neutral'
    dataset_splits = datasets.SST.splits(
        root='data', text_field=text_field, label_field=label_field,
        fine_grained=config['fine_grained'], filter_pred=filter_pred)
text_field.build_vocab(*dataset_splits, vectors=config['pretrained'])
label_field.build_vocab(*dataset_splits)
num_tokens = len(text_field.vocab)
num_classes = len(label_field.vocab)
PAD_ID = text_field.vocab.stoi['<pad>']  # '<pad>' is the Field's default pad token
dataset_info = {'num_tokens': num_tokens,
                'num_classes': num_classes,
                'PAD_ID': PAD_ID}
print(' initialize with pretrained vectors: %s' % config['pretrained'])
print(' number of classes: %d' % num_classes)
print(' number of tokens: %d' % num_tokens)
print(' max batch size: %d' % config['batch_size'])
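# How num_tokens and PAD_ID are typically consumed downstream (a sketch; the
# embedding dimension is illustrative):
import torch.nn as nn
embedding = nn.Embedding(num_embeddings=dataset_info['num_tokens'],
                         embedding_dim=300,
                         padding_idx=dataset_info['PAD_ID'])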
def main(config):
    vocab_len = get_babi_vocab(config.task)
    train_iter, val_iter, test_iter = datasets.BABI20.iters(
        batch_size=config.batch_size,
        root='.data',
        memory_size=70,
        task=config.task,
        joint=False,
        tenK=False,
        only_supporting=False,
        sort=False,
        shuffle=True)
    model = BabiUTransformer(num_vocab=vocab_len,
                             embedding_size=config.emb,
                             hidden_size=config.emb,
                             num_layers=config.max_hops,
                             num_heads=config.heads,
                             total_key_depth=config.depth,
                             total_value_depth=config.depth,
                             filter_size=config.filter,