How to use the torchtext.datasets module in torchtext

To help you get started, we’ve selected a few torchtext.datasets examples, based on popular ways it is used in public projects.

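All of the examples below use torchtext's classic Field/Dataset API (moved to torchtext.legacy in torchtext 0.9 and removed in 0.12, so pin an older version to run them). The shared pattern is: define fields, call the dataset's splits() classmethod, build vocabularies from the training split, and wrap the splits in iterators. Here is a minimal sketch of that pattern using IMDB; the argument values are illustrative, not taken from any of the projects below.

import torch
from torchtext import data, datasets

# fields describe how raw text and labels are tokenized and numericalized
TEXT = data.Field(lower=True, batch_first=True)
LABEL = data.Field(sequential=False)

# download (or load from the local cache) and split the dataset
train, test = datasets.IMDB.splits(TEXT, LABEL)

# build vocabularies from the training split only
TEXT.build_vocab(train, max_size=25000)
LABEL.build_vocab(train)

# bucket examples of similar length together to minimise padding
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_size=32,
    device=0 if torch.cuda.is_available() else -1)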

github pytorch / text / test / sst.py
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe, CharNGram, FastText


# Approach 1:
# set up fields
TEXT = data.Field()
LABEL = data.Field(sequential=False)

# make splits for data
train, val, test = datasets.SST.splits(
    TEXT, LABEL, fine_grained=True, train_subtrees=True,
    filter_pred=lambda ex: ex.label != 'neutral')

# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

# build the vocabulary
url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec'
TEXT.build_vocab(train, vectors=Vectors('wiki.simple.vec', url=url))
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())
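
The excerpt stops after the vocabularies are built. A typical next step, not shown in the original test file, is to wrap the three splits in iterators and inspect a batch; batch.text and batch.label correspond to the two fields defined above (a sketch with an arbitrary batch size):

# sketch: iterate over the SST splits built above
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=64)

batch = next(iter(train_iter))
print(batch.text.shape)   # [seq_len, batch_size]; batch_first defaults to False
print(batch.label.shape)  # [batch_size]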
github jihunchoi / shortcut-stacked-encoder-pytorch / eval_snli.py
def evaluate(args):
    lstm_hidden_dims = [int(d) for d in args.lstm_hidden_dims.split(',')]

    logging.info('Loading data...')
    text_field = data.Field(lower=True, include_lengths=True,
                            batch_first=False)
    label_field = data.Field(sequential=False)
    if not os.path.exists(args.data_dir):
        os.makedirs(args.data_dir)
    dataset_splits = datasets.SNLI.splits(
        text_field=text_field, label_field=label_field, root=args.data_dir)
    test_dataset = dataset_splits[2]
    text_field.build_vocab(*dataset_splits)
    label_field.build_vocab(*dataset_splits)
    _, _, test_loader = data.BucketIterator.splits(
        datasets=dataset_splits, batch_size=args.batch_size, device=args.gpu)

    logging.info('Building model...')
    num_classes = len(label_field.vocab)
    num_words = len(text_field.vocab)
    model = NLIModel(num_words=num_words, word_dim=args.word_dim,
                     lstm_hidden_dims=lstm_hidden_dims,
                     mlp_hidden_dim=args.mlp_hidden_dim,
                     mlp_num_layers=args.mlp_num_layers,
                     num_classes=num_classes, dropout_prob=0)
    model.load_state_dict(torch.load(args.model_path))
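
The excerpt ends once the trained weights are loaded. Because text_field was created with include_lengths=True, each SNLI batch exposes (tensor, lengths) pairs for the premise and hypothesis; a sketch of how the function might continue by unpacking test_loader (the forward call is omitted because NLIModel's signature is not shown in the excerpt):

    # sketch: unpack batches from the SNLI test iterator built above
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            premise, premise_lengths = batch.premise          # (tensor, lengths)
            hypothesis, hypothesis_lengths = batch.hypothesis
            labels = batch.label
            # pass the tensors to the model here; NLIModel's forward
            # signature is not part of the excerpt, so the call is omitted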
github akurniawan / pytorch-transformer / data.py
def get_mt_datasets(exts, fields, train_path, val_path, test_path=""):
    train = datasets.TranslationDataset(
        path=train_path, exts=exts, fields=fields)
    val = datasets.TranslationDataset(path=val_path, exts=exts, fields=fields)
    return train, val, None
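
TranslationDataset reads parallel text from path + ext for each extension in exts and pairs the fields with the source and target sides. A hypothetical call of the helper above (field settings, paths and extensions are illustrative only):

SRC = data.Field(init_token='<s>', eos_token='</s>')
TRG = data.Field(init_token='<s>', eos_token='</s>')

# expects e.g. data/train.de, data/train.en, data/val.de, data/val.en on disk
train, val, _ = get_mt_datasets(
    exts=('.de', '.en'), fields=(SRC, TRG),
    train_path='data/train', val_path='data/val')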
github michaelchen110 / Grammar-Correction / parser / prepare_vocab.py
def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = data.Field(tokenize=tokenize_en, init_token = BOS_WORD,
                     eos_token = EOS_WORD, pad_token=BLANK_WORD)


    ################
    #  Vocabulary  #
    ################
    if os.path.exists(vocab_file):
        print("Building vocabuary...")
        TEXT.vocab = torch.load(vocab_file)
    else:
        print("Loading data...")
        train = datasets.TranslationDataset(path=os.path.join(src_dir, 
            DATA), exts=('.train.src', '.train.trg'), fields=(TEXT, TEXT))
        MIN_FREQ = 2
        TEXT.build_vocab(train.src, min_freq=MIN_FREQ)

    #########################
    #  Save in count order  #
    #########################

    ordered_words = [word for word, _ in TEXT.vocab.freqs.most_common()]
    with open(vocab_freq_file, 'w') as f:
        print('Writing...')
        f.write('<s>\n</s>\n\n')

        for word in ordered_words:
            f.write(word + '\n')
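
Not shown in the excerpt: for the torch.load(vocab_file) branch above to ever succeed, the freshly built vocabulary presumably has to be cached first, e.g.:

    # sketch: cache the built vocabulary so later runs can torch.load it
    torch.save(TEXT.vocab, vocab_file)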
github salesforce / matchbox / examples / transformer.py
            for src, trg in zip(batch.src.examples(), batch.trg.examples()):
                logits = self(src, trg[:, :-1])
                loss += F.cross_entropy(logits, trg[:, 1:], reduce=reduce)
            return loss
        logits = self(batch.src, batch.trg[:, :-1])
        return F.cross_entropy(logits, batch.trg[:, 1:], reduce=reduce)

if __name__ == '__main__':
    import sys
    unbatch = sys.argv[1] == '1'
    small = sys.argv[2] == '1'
    if sys.argv[3] == '1':
        TEXT = data.Field(batch_first=True)
    else:
        TEXT = MaskedBatchField(batch_first=True)
    train, dev, test = datasets.IWSLT.splits(('.de', '.en'), (TEXT, TEXT))
    TEXT.build_vocab(train, max_size=50000)
    random.seed(0)
    torch.manual_seed(0)
    train_iter = data.BucketIterator(
        train, batch_size=32, device=0 if torch.cuda.is_available() else -1)
    args = argparse.Namespace()
    args.__dict__.update(d_model=8 if small else 512,
                         d_hidden=1 if small else 2048,
                         n_heads=8, drop_ratio=0,
                         n_layers=6, length_ratio=1.5)
    model = Transformer(TEXT, TEXT, args)
    if torch.cuda.is_available(): model.cuda()
    for i, b in enumerate(train_iter):
        if i == 1:
            t = time.time()
        if i == 2:
github wabyking / TextClassificationBenchmark / utils.py
def loadData(opt):
    if not opt.from_torchtext:
        import dataHelper as helper
        return helper.loadData(opt)
    device = 0 if  torch.cuda.is_available()  else -1

    TEXT = data.Field(lower=True, include_lengths=True, batch_first=True,fix_length=opt.max_seq_len)
    LABEL = data.Field(sequential=False)
    if opt.dataset=="imdb":
        train, test = datasets.IMDB.splits(TEXT, LABEL)
    elif opt.dataset=="sst":
        train, val, test = datasets.SST.splits( TEXT, LABEL, fine_grained=True, train_subtrees=True,
                                               filter_pred=lambda ex: ex.label != 'neutral')
    elif opt.dataset=="trec":
        train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True)
    else:
        print("does not support this datset")
        
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train)    
    # print vocab information
    print('len(TEXT.vocab)', len(TEXT.vocab))
    print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

    train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=opt.batch_size,device=device,repeat=False,shuffle=True)
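
The excerpt cuts off before the function returns. Because the text field uses batch_first=True, include_lengths=True and fix_length=opt.max_seq_len, batches from these iterators unpack as below (a sketch, not part of the original function):

    # sketch: consume one batch from the iterators built above
    batch = next(iter(train_iter))
    text, lengths = batch.text      # include_lengths=True -> (tensor, lengths) pair
    labels = batch.label
    print(text.shape)               # [batch_size, opt.max_seq_len] with batch_first=True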
github michaelchen110 / Grammar-Correction / transformer / transformer_train.py
    #   Data Loading    #
    #####################
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = ""
    MIN_FREQ = 2

    spacy_en = spacy.load('en')
    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]
    TEXT = data.Field(tokenize=tokenize_en, init_token = BOS_WORD,
                     eos_token = EOS_WORD, pad_token=BLANK_WORD)

    train = datasets.TranslationDataset(path=os.path.join(SRC_DIR, DATA),
            exts=('.train.src', '.train.trg'), fields=(TEXT, TEXT))
    val = datasets.TranslationDataset(path=os.path.join(SRC_DIR, DATA), 
            exts=('.val.src', '.val.trg'), fields=(TEXT, TEXT))

    train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=device,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True)
    valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=device,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=False)

    random_idx = random.randint(0, len(train) - 1)
    print(train[random_idx].src)
    print(train[random_idx].trg)

    ################
    #  Vocabulary  #
    ################
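
The excerpt is cut off right after this banner. The vocabulary step presumably mirrors prepare_vocab.py above, along the lines of (a sketch, not the original code):

    # sketch: build a shared vocabulary from the training source side
    TEXT.build_vocab(train.src, min_freq=MIN_FREQ)
    print('vocab size:', len(TEXT.vocab))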
github wabyking / TextClassificationBenchmark / dataloader / torch_text_demo / trec.py
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe, CharNGram
import torch
if not torch.cuda.is_available() :
    device = -1
else:
    device = 0

# Approach 1:
# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)


# make splits for data
train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True)

# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits(
github Chanrom / sentence-representation-for-classification-PyTorch / train.py
        print '  initialize with pretrained vectors: %s' % config['pretrained']
        print '  number of classes: %d' % num_classes
        print '  number of tokens: %d' % num_tokens
        print '  max batch size: %d' % config['batch_size']

        # sort_within_batch is set True because we may use nn.LSTM
        train_loader, valid_loader = data.BucketIterator.splits(
            datasets=dataset_splits, batch_size=config['batch_size'],
            sort_within_batch=True, device=config['gpu'])

    elif config['dataset'] == 'SST':
        filter_pred = None
        if not config['fine_grained']:
            filter_pred = lambda ex: ex.label != 'neutral'

        dataset_splits = datasets.SST.splits(
            root='data', text_field=text_field, label_field=label_field,
            fine_grained=config['fine_grained'], filter_pred=filter_pred)

        text_field.build_vocab(*dataset_splits, vectors=config['pretrained'])
        label_field.build_vocab(*dataset_splits)
        num_tokens = len(text_field.vocab)
        num_classes = len(label_field.vocab)
        PAD_ID = text_field.vocab.stoi['<pad>']
        dataset_info = {'num_tokens':num_tokens,
                        'num_classes':num_classes,
                        'PAD_ID':PAD_ID}

        print '  initialize with pretrained vectors: %s' % config['pretrained']
        print '  number of classes: %d' % num_classes
        print '  number of tokens: %d' % num_tokens
        print '  max batch size: %d' % config['batch_size']
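
The excerpt ends here, but the SST branch presumably finishes the same way as the SNLI branch above, wrapping the splits in bucketing iterators (a sketch mirroring the earlier call, not the original code):

        # sketch: mirror the SNLI branch's iterator construction for SST
        train_loader, valid_loader, test_loader = data.BucketIterator.splits(
            datasets=dataset_splits, batch_size=config['batch_size'],
            sort_within_batch=True, device=config['gpu'])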
github andreamad8 / Universal-Transformer-Pytorch / main.py
def main(config):
    vocab_len = get_babi_vocab(config.task)
    train_iter, val_iter, test_iter = datasets.BABI20.iters(batch_size=config.batch_size, 
                                                            root='.data', 
                                                            memory_size=70, 
                                                            task=config.task, 
                                                            joint=False,
                                                            tenK=False, 
                                                            only_supporting=False, 
                                                            sort=False, 
                                                            shuffle=True)
    model = BabiUTransformer(num_vocab=vocab_len, 
                    embedding_size=config.emb, 
                    hidden_size=config.emb, 
                    num_layers=config.max_hops,
                    num_heads=config.heads, 
                    total_key_depth=config.depth, 
                    total_value_depth=config.depth,
                    filter_size=config.filter,