dataset = data.TabularDataset(
    path=self.test_has_header_dataset_path, format=format_,
    skip_header=False, fields=fields)
TEXT.build_vocab(dataset)
for i, example in enumerate(dataset):
self.assertEqual(example.text,
example_with_header[i + 1][0].lower().split())
self.assertEqual(example.label, example_with_header[i + 1][1])
# check that the vocabulary is built correctly (#225)
expected_freqs = {"hello": 1, "world": 2, "goodbye": 1, "text": 0}
for k, v in expected_freqs.items():
self.assertEqual(TEXT.vocab.freqs[k], v)
data_iter = data.Iterator(dataset, batch_size=1,
sort_within_batch=False, repeat=False)
next(data_iter.__iter__())
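# A self-contained sketch of the same check as above (assuming the legacy torchtext API,
# torchtext<=0.8 or torchtext.legacy): build a vocabulary from a tiny in-memory
# TabularDataset and verify that Vocab.freqs agrees with a plain collections.Counter
# over the tokenized text. The file path and field names here are illustrative.
import collections
import csv
import tempfile

from torchtext import data  # legacy API

rows = [("hello world", "pos"), ("goodbye world", "neg")]
with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False, newline="") as f:
    csv.writer(f, delimiter="\t").writerows(rows)
    tsv_path = f.name

TOY_TEXT = data.Field(lower=True)
TOY_LABEL = data.Field(sequential=False)
toy_dataset = data.TabularDataset(path=tsv_path, format="tsv",
                                  fields=[("text", TOY_TEXT), ("label", TOY_LABEL)])
TOY_TEXT.build_vocab(toy_dataset)

expected = collections.Counter(tok for ex in toy_dataset for tok in ex.text)
assert all(TOY_TEXT.vocab.freqs[tok] == cnt for tok, cnt in expected.items())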
for i, token in enumerate(TEXT.vocab.itos):
wv_index = stoi.get(token, None)
if wv_index is not None:
TEXT.vocab.vectors[i] = vectors[wv_index]
match_embedding += 1
else:
TEXT.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25)
else:
print("Error: Need word embedding pt file")
exit(1)
print("Embedding match number {} out of {}".format(match_embedding, len(TEXT.vocab)))
train_iter = data.Iterator(train, batch_size=args.batch_size, device=args.gpu, train=True, repeat=False,
sort=False, shuffle=True, sort_within_batch=False)
dev_iter = data.Iterator(dev, batch_size=args.batch_size, device=args.gpu, train=False, repeat=False,
sort=False, shuffle=False, sort_within_batch=False)
test_iter = data.Iterator(test, batch_size=args.batch_size, device=args.gpu, train=False, repeat=False,
sort=False, shuffle=False, sort_within_batch=False)
config = args
config.words_num = len(TEXT.vocab)
if args.dataset == 'EntityDetection':
config.label = len(ED.vocab)
model = EntityDetection(config)
else:
print("Error Dataset")
exit()
model.embed.weight.data.copy_(TEXT.vocab.vectors)
if args.cuda:
    model = model.cuda()
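# A hedged alternative to the copy_ call above: instead of creating the embedding layer
# and then copying TEXT.vocab.vectors into model.embed.weight.data, PyTorch's
# nn.Embedding.from_pretrained builds the layer directly from the vector matrix
# (freeze=False keeps it trainable). Sketch only, assuming TEXT.vocab.vectors is populated.
import torch.nn as nn

embed = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)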
tokens = text.rstrip().split('\n')
specials = ['<unk>', '<pad>', '<bos>', '<eos>']
#args_field.vocab = Vocab(tokens, specials=specials, vectors='glove.6B.200d', vectors_cache='/glove')
args_field.vocab = Vocab(tokens, specials=specials) #, vectors='fasttext.en', vectors_cache='/fasttext')
config.n_args = len(args_field.vocab)
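# A minimal sketch of the standard legacy-torchtext way to build a Vocab from a plain
# token list: torchtext.vocab.Vocab expects a collections.Counter (the Vocab used above
# may be a project-specific class that accepts a list directly).
from collections import Counter
from torchtext.vocab import Vocab

toy_tokens = ["paris", "france", "capital", "paris"]   # illustrative
toy_vocab = Vocab(Counter(toy_tokens), specials=["<unk>", "<pad>", "<bos>", "<eos>"])
print(len(toy_vocab), toy_vocab.stoi["paris"])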
# data
train_data = TabularDataset(path=config.train_data_path, format='tsv', fields = [('word1', args_field), ('word2', args_field), ('label', label_field)])
dev_data = TabularDataset(path=config.dev_data_path, format='tsv', fields = [('word1', args_field), ('word2', args_field), ('label', label_field)])
test_data = TabularDataset(path=config.test_data_path, format='tsv', fields = [('word1', args_field), ('word2', args_field), ('label', label_field)]) if hasattr(config, 'test_data_path') else None
label_field.build_vocab(train_data, dev_data)
# iter
train_iter = Iterator(train_data, train=True, shuffle=True, repeat=False, batch_size=config.train_batch_size)
dev_iter = Iterator(dev_data, train=False, shuffle=True, repeat=False, sort=False, batch_size=config.dev_batch_size)
test_iter = Iterator(test_data, train=False, shuffle=True, repeat=False, sort=False, batch_size=config.dev_batch_size) if test_data is not None else None
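# Illustrative sketch (legacy torchtext): each line of the TSV files above is expected to
# hold three tab-separated columns matching the declared fields, e.g.
# "paris\tfrance\tcapital_of", and a batch from these iterators exposes one tensor per field.
def peek_batch(iterator):
    batch = next(iter(iterator))
    return batch.word1, batch.word2, batch.label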
# relemb model
relation_embedding_model = RelationalEmbeddingModel(config, args_field.vocab, args_field.vocab)
load_model(config.model_file, relation_embedding_model)
for param in relation_embedding_model.parameters():
param.requires_grad = False
relation_embedding_model.eval()
relation_embedding_model.cuda()
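# The freeze-for-inference pattern used above, as a small reusable helper
# (plain PyTorch; nothing repo-specific is assumed).
import torch.nn as nn

def freeze(module: nn.Module) -> nn.Module:
    for p in module.parameters():
        p.requires_grad = False   # exclude from gradient computation
    return module.eval()          # switch dropout/batch-norm to inference mode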
# glove
#args_field.vocab.load_vectors(vectors='glove.6B.100d', cache='/glove')
# args_field.vocab.load_vectors(vectors='glove.6B.300d', cache='/glove')
args_field.vocab.load_vectors(vectors='fasttext.en.300d', cache='/fasttext')
glove = Embedding(len(args_field.vocab), 300)
glove.weight.requires_grad = False  # freeze the embedding weight parameter
pretrained = normalize(args_field.vocab.vectors, dim=-1)
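# Why normalize(dim=-1): with unit-length rows, a matrix product of the pretrained
# vectors gives cosine similarities directly. A hedged sketch of a nearest-neighbour
# lookup over the normalized matrix (the token argument and k are illustrative).
import torch

def nearest(token, k=5):
    idx = args_field.vocab.stoi[token]
    sims = pretrained @ pretrained[idx]          # cosine similarity to every row
    top = torch.topk(sims, k + 1).indices[1:]    # drop the query token itself
    return [args_field.vocab.itos[i] for i in top]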
:param text_field: text field (vocabulary) for the fine-tuned embeddings
:param label_field: label field (vocabulary) for the fine-tuned embeddings
:param static_text_field: text field for the static (non-fine-tuned) embeddings
:param static_label_field: label field for the static (non-fine-tuned) embeddings
:param kargs: other keyword arguments forwarded to the iterators
:return: train, dev and test batch iterators
"""
train_data, dev_data, test_data = mydatasets_self_five.MR.splits(path, train_name, dev_name, test_name, char_data, text_field, label_field)
static_train_data, static_dev_data, static_test_data = mydatasets_self_five.MR.splits(path, train_name, dev_name, test_name, char_data, static_text_field, static_label_field)
print("len(train_data) {} ".format(len(train_data)))
print("len(static_train_data) {} ".format(len(static_train_data)))
text_field.build_vocab(train_data, min_freq=config.min_freq)
label_field.build_vocab(train_data)
static_text_field.build_vocab(static_train_data, static_dev_data, static_test_data, min_freq=config.min_freq)
static_label_field.build_vocab(static_train_data, static_dev_data, static_test_data)
train_iter, dev_iter, test_iter = data.Iterator.splits((train_data, dev_data, test_data), batch_sizes=(config.batch_size, len(dev_data), len(test_data)), device=-1, **kargs)
return train_iter, dev_iter, test_iter
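# A hedged usage sketch for the iterators returned above (legacy torchtext): each batch
# exposes one attribute per declared field, assumed here to be `text` and `label` as in
# the MR dataset helpers. The loss and optimizer choices are illustrative, and `feature`
# may need transposing depending on whether the Fields were declared with batch_first.
import torch.nn.functional as F

def train_epoch(model, train_iter, optimizer):
    model.train()
    for batch in train_iter:
        feature, target = batch.text, batch.label
        optimizer.zero_grad()
        loss = F.cross_entropy(model(feature), target)
        loss.backward()
        optimizer.step()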
if args.resume_snapshot:
with open(args.dic, 'rb') as dic_file:
dictionary = pickle.load(dic_file)
# Reconstruct the dictionary in torchtext.
counter = Counter({'<unk>': 0, '<pad>': 0})
TEXT.vocab = vocab.Vocab(counter, specials=['<unk>', '<pad>'])
TEXT.vocab.itos = dictionary.idx2word
TEXT.vocab.stoi = defaultdict(vocab._default_unk_index, dictionary.word2idx)
TEXT.vocab.load_vectors('glove.6B.%dd' % args.embedding_dim)
itos = TEXT.vocab.itos if args.p else None
print('Vocab size %d' % len(TEXT.vocab))
train_iter = data.Iterator(dataset=train, batch_size=args.batch_size,
sort_key=lambda x: len(x.context), sort=True, repeat=False)
valid_iter = data.Iterator(dataset=valid, batch_size=args.batch_size, sort_key=lambda x: len(x.context), sort=True, repeat=False)
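# Length-sorted batching can also be done with the legacy BucketIterator, which groups
# examples of similar length to reduce padding. A hedged equivalent of the two iterators
# above (same sort_key; shuffling happens within and across buckets for the training set):
train_iter, valid_iter = data.BucketIterator.splits(
    (train, valid), batch_size=args.batch_size,
    sort_key=lambda x: len(x.context), sort_within_batch=True, repeat=False)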
print('Initializing the model')
if args.load_model != '':
with open(args.load_model, 'rb') as f:
model = torch.load(f).cuda()
elif args.decider_type == 'cnncontext':
model = CNNContextClassifier(len(TEXT.vocab), args.embedding_dim,
args.hidden_dim, args.filter_size, args.dropout_rate,
embed_mat=TEXT.vocab.vectors,
fix_embeddings=args.fix_embeddings).cuda()
elif args.decider_type == 'poolending':
model = PoolEndingClassifier(len(TEXT.vocab), args.embedding_dim,
args.hidden_dim,
embed_mat=TEXT.vocab.vectors,
fix_embeddings=args.fix_embeddings).cuda()
questions.build_vocab(train, dev, test) # Test dataset can not be used here for constructing the vocab
# build vocab for tags
labels.build_vocab(train, dev, test)
if os.path.isfile(args.vector_cache):
questions.vocab.vectors = torch.load(args.vector_cache)
else:
questions.vocab.load_vectors(wv_dir=args.data_cache, wv_type=args.word_vectors, wv_dim=args.d_embed)
os.makedirs(os.path.dirname(args.vector_cache), exist_ok=True)
torch.save(questions.vocab.vectors, args.vector_cache)
# Buckets
# train_iters, dev_iters, test_iters = data.BucketIterator.splits(
# (train, dev, test), batch_size=args.batch_size, device=args.gpu)
train_iter = data.Iterator(train, batch_size=args.batch_size, device=args.gpu, train=True, repeat=False,
sort=False, shuffle=True)
dev_iter = data.Iterator(dev, batch_size=args.batch_size, device=args.gpu, train=True, repeat=False,
sort=False, shuffle=False)
test_iter = data.Iterator(test, batch_size=args.batch_size, device=args.gpu, train=True, repeat=False,
sort=False, shuffle=False)
# define models
config = args
config.n_embed = len(questions.vocab)
config.n_out = len(labels.vocab)  # I (inside entity) / O (outside entity)
config.n_cells = config.n_layers
if config.birnn:
config.n_cells *= 2
print(config)
BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"
MIN_FREQ = 2
spacy_en = spacy.load('en')
def tokenize_en(text):
return [tok.text for tok in spacy_en.tokenizer(text)]
TEXT = data.Field(tokenize=tokenize_en, init_token = BOS_WORD,
eos_token = EOS_WORD, pad_token=BLANK_WORD)
test = datasets.TranslationDataset(path=os.path.join(SRC_DIR, DATA),
exts=('.test.src', '.test.trg'), fields=(TEXT, TEXT))
# use the same order as original data
test_iter = data.Iterator(test, batch_size=BATCH_SIZE, device=device,
sort=False, repeat=False, train=False)
random_idx = random.randint(0, len(test) - 1)
print(test[random_idx].src)
print(test[random_idx].trg)
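# Illustrative: TranslationDataset examples store raw token lists (fields 'src' and 'trg'),
# so basic corpus statistics can be computed before any vocabulary is attached.
avg_src_len = sum(len(ex.src) for ex in test) / len(test)
avg_trg_len = sum(len(ex.trg) for ex in test) / len(test)
print("test examples: %d, avg src/trg length: %.1f / %.1f" % (len(test), avg_src_len, avg_trg_len))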
###############
# Vocabulary #
###############
TEXT.vocab = torch.load(vocab_file)
pad_idx = TEXT.vocab.stoi[""]
print("Load %s vocabuary; vocab size = %d" % (DATA, len(TEXT.vocab)))
#####################
# Word Embedding #
#####################