How to use the torchtext.data.Iterator function in torchtext

To help you get started, we’ve selected a few torchtext.data.Iterator examples based on popular ways it is used in public projects.

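Before looking at the project snippets below, here is a minimal, self-contained sketch of the typical pattern. It assumes the legacy torchtext API (torchtext < 0.9, or torchtext.legacy in 0.9-0.11); the field names and toy data are illustrative rather than taken from any of the projects shown further down.

# Assumes the legacy torchtext API (pre-0.9, or torchtext.legacy in 0.9-0.11).
from torchtext import data

# Fields describe how raw values are tokenized and numericalized.
TEXT = data.Field(lower=True, batch_first=True)
LABEL = data.Field(sequential=False, unk_token=None)

# Toy in-memory dataset; real projects usually load a TabularDataset from a file.
fields = [('text', TEXT), ('label', LABEL)]
rows = [("hello world", "pos"), ("goodbye world", "neg")]
examples = [data.Example.fromlist(row, fields) for row in rows]
dataset = data.Dataset(examples, fields)

# Vocabularies must be built before the iterator can numericalize batches.
TEXT.build_vocab(dataset)
LABEL.build_vocab(dataset)

# Iterator groups examples into batches; repeat=False stops after one full pass.
train_iter = data.Iterator(dataset, batch_size=2, train=True, repeat=False,
                           shuffle=True, sort=False, device='cpu')
for batch in train_iter:
    print(batch.text.shape, batch.label)

Each batch exposes the dataset's fields as attributes (batch.text and batch.label here). The real-world examples below follow the same build-vocab-then-iterate pattern at larger scale.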

github pytorch / text / test / data / test_dataset.py
            dataset = data.TabularDataset(
                path=self.test_has_header_dataset_path, format=format_,
                skip_header=False, fields=fields)

            TEXT.build_vocab(dataset)

            for i, example in enumerate(dataset):
                self.assertEqual(example.text,
                                 example_with_header[i + 1][0].lower().split())
                self.assertEqual(example.label, example_with_header[i + 1][1])

            # check that the vocabulary is built correctly (#225)
            expected_freqs = {"hello": 1, "world": 2, "goodbye": 1, "text": 0}
            for k, v in expected_freqs.items():
                self.assertEqual(TEXT.vocab.freqs[k], v)

            data_iter = data.Iterator(dataset, batch_size=1,
                                      sort_within_batch=False, repeat=False)
            next(data_iter.__iter__())
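In this test, data.Iterator wraps a TabularDataset after the vocabulary has been built. next(data_iter.__iter__()) (equivalently next(iter(data_iter))) pulls a single batch, and repeat=False makes the iterator stop after one pass over the dataset instead of cycling indefinitely.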
github castorini / BuboQA / entity_detection / nn / train.py
    for i, token in enumerate(TEXT.vocab.itos):
        wv_index = stoi.get(token, None)
        if wv_index is not None:
            TEXT.vocab.vectors[i] = vectors[wv_index]
            match_embedding += 1
        else:
            TEXT.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25)
else:
    print("Error: Need word embedding pt file")
    exit(1)

print("Embedding match number {} out of {}".format(match_embedding, len(TEXT.vocab)))

train_iter = data.Iterator(train, batch_size=args.batch_size, device=args.gpu, train=True, repeat=False,
                                   sort=False, shuffle=True, sort_within_batch=False)
dev_iter = data.Iterator(dev, batch_size=args.batch_size, device=args.gpu, train=False, repeat=False,
                                   sort=False, shuffle=False, sort_within_batch=False)
test_iter = data.Iterator(test, batch_size=args.batch_size, device=args.gpu, train=False, repeat=False,
                                   sort=False, shuffle=False, sort_within_batch=False)

config = args
config.words_num = len(TEXT.vocab)

if args.dataset == 'EntityDetection':
    config.label = len(ED.vocab)
    model = EntityDetection(config)
else:
    print("Error Dataset")
    exit()

model.embed.weight.data.copy_(TEXT.vocab.vectors)
if args.cuda:
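This script builds one Iterator per split: the training iterator shuffles its examples (train=True, shuffle=True) while the dev and test iterators preserve their original order (train=False, shuffle=False), and sort=False together with sort_within_batch=False disables length-based reordering. device=args.gpu places the resulting batch tensors directly on the chosen device.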
github mandarjoshi90 / pair2vec / noallen / lexinf / train.py
    tokens = text.rstrip().split('\n')
    specials = ['', '', '', '']
    #args_field.vocab = Vocab(tokens, specials=specials, vectors='glove.6B.200d', vectors_cache='/glove')
    args_field.vocab = Vocab(tokens, specials=specials) #, vectors='fasttext.en', vectors_cache='/fasttext')
    config.n_args = len(args_field.vocab)

    # data
    train_data = TabularDataset(path=config.train_data_path, format='tsv', fields = [('word1', args_field), ('word2', args_field), ('label', label_field)])
    dev_data = TabularDataset(path=config.dev_data_path, format='tsv', fields = [('word1', args_field), ('word2', args_field), ('label', label_field)])
    test_data = TabularDataset(path=config.test_data_path, format='tsv', fields = [('word1', args_field), ('word2', args_field), ('label', label_field)]) if hasattr(config, 'test_data_path') else None
    label_field.build_vocab(train_data, dev_data)

    # iter
    train_iter = Iterator(train_data, train=True, shuffle=True, repeat=False, batch_size=config.train_batch_size)
    dev_iter = Iterator(dev_data, train=False, shuffle=True, repeat=False, sort=False, batch_size=config.dev_batch_size)
    test_iter = Iterator(test_data, train=False, shuffle=True, repeat=False, sort=False, batch_size=config.dev_batch_size) if test_data is not None else None

    # relemb model
    relation_embedding_model = RelationalEmbeddingModel(config, args_field.vocab, args_field.vocab)
    load_model(config.model_file, relation_embedding_model)
    for param in relation_embedding_model.parameters():
        param.requires_grad = False
    relation_embedding_model.eval()
    relation_embedding_model.cuda()

    # glove
    #args_field.vocab.load_vectors(vectors='glove.6B.100d', cache='/glove')
    # args_field.vocab.load_vectors(vectors='glove.6B.300d', cache='/glove')
    args_field.vocab.load_vectors(vectors='fasttext.en.300d', cache='/fasttext')
    glove = Embedding(len(args_field.vocab), 300)
    glove.weight.requires_grad = False  # freeze the pretrained embedding weights
    pretrained = normalize(args_field.vocab.vectors, dim=-1)
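Only the three Iterator(...) calls in the middle of this snippet concern batching; the surrounding code restores a frozen relation-embedding model and loads pretrained word vectors. Note that the dev and test iterators here pass shuffle=True, which only affects evaluation order, while sort=False keeps torchtext from reordering the examples by length.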
github bamtercelboo / cnn-lstm-bilstm-deepcnn-clstm-in-pytorch / main.py
    :param text_field: text field for the fine-tuned embeddings
    :param label_field: label field for the fine-tuned embeddings
    :param static_text_field: text field for the static (not fine-tuned) embeddings
    :param static_label_field: label field for the static (not fine-tuned) embeddings
    :param kargs: other keyword arguments
    :return: train, dev and test batch iterators
    """
    train_data, dev_data, test_data = mydatasets_self_five.MR.splits(path, train_name, dev_name, test_name, char_data, text_field, label_field)
    static_train_data, static_dev_data, static_test_data = mydatasets_self_five.MR.splits(path, train_name, dev_name, test_name, char_data, static_text_field, static_label_field)
    print("len(train_data) {} ".format(len(train_data)))
    print("len(static_train_data) {} ".format(len(static_train_data)))
    text_field.build_vocab(train_data, min_freq=config.min_freq)
    label_field.build_vocab(train_data)
    static_text_field.build_vocab(static_train_data, static_dev_data, static_test_data, min_freq=config.min_freq)
    static_label_field.build_vocab(static_train_data, static_dev_data, static_test_data)
    train_iter, dev_iter, test_iter = data.Iterator.splits((train_data, dev_data, test_data), batch_sizes=(config.batch_size, len(dev_data), len(test_data)), device=-1, **kargs)
    return train_iter, dev_iter, test_iter
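data.Iterator.splits builds one iterator per dataset in a single call, and the batch_sizes tuple gives each split its own batch size; here the dev and test sets are consumed as single full-size batches. device=-1 is the old integer way of requesting CPU tensors (later legacy releases accept a torch.device or a string such as 'cpu').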
github castorini / BuboQA / ferhan_simple_qa_rnn / entity_detection / train.py
labels.build_vocab(train, dev, test)

if os.path.isfile(args.vector_cache):
    questions.vocab.vectors = torch.load(args.vector_cache)
else:
    questions.vocab.load_vectors(wv_dir=args.data_cache, wv_type=args.word_vectors, wv_dim=args.d_embed)
    os.makedirs(os.path.dirname(args.vector_cache), exist_ok=True)
    torch.save(questions.vocab.vectors, args.vector_cache)

# Buckets
# train_iters, dev_iters, test_iters = data.BucketIterator.splits(
#     (train, dev, test), batch_size=args.batch_size, device=args.gpu)

train_iter = data.Iterator(train, batch_size=args.batch_size, device=args.gpu, train=True, repeat=False,
                                   sort=False, shuffle=True)
dev_iter = data.Iterator(dev, batch_size=args.batch_size, device=args.gpu, train=True, repeat=False,
                                   sort=False, shuffle=False)
test_iter = data.Iterator(test, batch_size=args.batch_size, device=args.gpu, train=True, repeat=False,
                                   sort=False, shuffle=False)

# define models

config = args
config.n_embed = len(questions.vocab)
config.n_out = len(labels.vocab) # I/in entity  O/out of entity
config.n_cells = config.n_layers

if config.birnn:
    config.n_cells *= 2
print(config)

if args.resume_snapshot:
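The commented-out BucketIterator.splits call is the usual alternative when example lengths vary widely: it groups examples of similar length into the same batch to minimize padding. A hedged sketch of that variant, where the question attribute in sort_key is an assumption about this project's field names:

train_iter, dev_iter, test_iter = data.BucketIterator.splits(
    (train, dev, test), batch_size=args.batch_size, device=args.gpu,
    sort_key=lambda ex: len(ex.question),  # attribute name is an assumption
    sort_within_batch=True)

Also note that this script creates its dev and test iterators with train=True, so they are shuffled by default; the other BuboQA example above (entity_detection/nn/train.py) passes train=False, shuffle=False for evaluation instead.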
github ari-holtzman / l2w / trainers / train_classifier.py
with open(args.dic, 'rb') as dic_file:
    dictionary = pickle.load(dic_file)

# Reconstruct the dictionary in torchtext.
counter = Counter({'<unk>': 0, '<pad>': 0})
TEXT.vocab = vocab.Vocab(counter, specials=['<unk>', '<pad>'])
TEXT.vocab.itos = dictionary.idx2word
TEXT.vocab.stoi = defaultdict(vocab._default_unk_index, dictionary.word2idx)

TEXT.vocab.load_vectors('glove.6B.%dd' % args.embedding_dim)
itos = TEXT.vocab.itos if args.p else None
print('Vocab size %d' % len(TEXT.vocab))

train_iter = data.Iterator(dataset=train, batch_size=args.batch_size,
        sort_key=lambda x: len(x.context), sort=True, repeat=False)
valid_iter = data.Iterator(dataset=valid, batch_size=args.batch_size, sort_key=lambda x: len(x.context), sort=True, repeat=False)

print('Initializing the model')

if args.load_model != '':
    with open(args.load_model, 'rb') as f:
        model = torch.load(f).cuda()
elif args.decider_type == 'cnncontext':
    model = CNNContextClassifier(len(TEXT.vocab), args.embedding_dim,
            args.hidden_dim, args.filter_size, args.dropout_rate,
            embed_mat=TEXT.vocab.vectors,
            fix_embeddings=args.fix_embeddings).cuda()
elif args.decider_type == 'poolending':
    model = PoolEndingClassifier(len(TEXT.vocab), args.embedding_dim,
            args.hidden_dim,
            embed_mat=TEXT.vocab.vectors,
            fix_embeddings=args.fix_embeddings).cuda()
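Here both iterators use sort=True with sort_key=lambda x: len(x.context), so batches are drawn from examples ordered by context length. That minimizes padding within a batch, but it also means the training order is deterministic rather than reshuffled between epochs.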
github michaelchen110 / Grammar-Correction / transformer / transformer_pred.py
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = "<blank>"
    MIN_FREQ = 2

    spacy_en = spacy.load('en')

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = data.Field(tokenize=tokenize_en, init_token = BOS_WORD,
                     eos_token = EOS_WORD, pad_token=BLANK_WORD)

    test = datasets.TranslationDataset(path=os.path.join(SRC_DIR, DATA), 
            exts=('.test.src', '.test.trg'), fields=(TEXT, TEXT))
    # use the same order as original data
    test_iter = data.Iterator(test, batch_size=BATCH_SIZE, device=device, 
                              sort=False, repeat=False, train=False)

    random_idx = random.randint(0, len(test) - 1)
    print(test[random_idx].src)
    print(test[random_idx].trg)

    ################
    #  Vocabulary  #
    ################
    TEXT.vocab = torch.load(vocab_file)
    pad_idx = TEXT.vocab.stoi["<blank>"]

    print("Load %s vocabuary; vocab size = %d" % (DATA, len(TEXT.vocab)))
    #####################
    #   Word Embedding  #
    #####################
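For prediction, the test iterator is built with train=False, sort=False and repeat=False so that batches come out in the original file order (as the inline comment notes), which makes it straightforward to line the model's outputs back up with the source sentences.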