How to use the torchtext.data.TabularDataset class in torchtext

To help you get started, we’ve selected a few torchtext.data.TabularDataset examples, drawn from popular ways the class is used in public projects.

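TabularDataset reads CSV, TSV, or JSON-lines files and binds each column (or key) to a torchtext Field. A minimal sketch, assuming a two-column TSV named example.tsv (the file name and column layout are assumptions; in torchtext releases after 0.8 these classes live under torchtext.legacy.data):

from torchtext import data

TEXT = data.Field(lower=True)
LABEL = data.Field(sequential=False)

# Each (name, Field) pair binds one TSV column, in order, to an attribute
# on the resulting examples.
dataset = data.TabularDataset(
    path='example.tsv', format='tsv',
    fields=[('text', TEXT), ('label', LABEL)])

TEXT.build_vocab(dataset)
LABEL.build_vocab(dataset)
print(dataset[0].text, dataset[0].label)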

github pytorch / text / test / data / test_dataset.py
def test_json_valid_and_invalid_nested_key(self):
        self.write_test_nested_key_json_dataset()
        valid_fields = {'foods.vegetables.name': ('vegs', data.Field()),
                        'foods.fruits': ('fruits', data.Field())}
        invalid_fields = {'foods.vegetables.color': ('vegs', data.Field())}

        expected_examples = [
            {"fruits": ["Apple", "Banana"],
             "vegs": ["Broccoli", "Cabbage"]},
            {"fruits": ["Cherry", "Grape", "Lemon"],
             "vegs": ["Cucumber", "Lettuce"]},
            {"fruits": ["Orange", "Pear", "Strawberry"],
             "vegs": ["Marrow", "Spinach"]}
        ]
        dataset = data.TabularDataset(
            path=self.test_nested_key_json_dataset_path,
            format="json",
            fields=valid_fields)
        # check results
        for example, expect in zip(dataset.examples, expected_examples):
            self.assertEqual(example.vegs, expect['vegs'])
            self.assertEqual(example.fruits, expect['fruits'])

        with self.assertRaises(ValueError):
            data.TabularDataset(
                path=self.test_nested_key_json_dataset_path,
                format="json",
                fields=invalid_fields)
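With format="json", every line of the file is parsed as a separate JSON object, and dotted keys such as 'foods.vegetables.name' descend into nested structures. A hypothetical input line consistent with the expected examples above (the actual test file may differ):

# {"foods": {"fruits": ["Apple", "Banana"],
#            "vegetables": [{"name": "Broccoli"}, {"name": "Cabbage"}]}}

The invalid_fields construction fails with ValueError because no object under foods.vegetables carries a 'color' key.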
github pytorch / text / test / data / test_batch.py
def test_batch_iter(self):
        self.write_test_numerical_features_dataset()
        FLOAT = data.Field(use_vocab=False, sequential=False,
                           dtype=torch.float)
        INT = data.Field(use_vocab=False, sequential=False, is_target=True)
        TEXT = data.Field(sequential=False)

        dst = data.TabularDataset(path=self.test_numerical_features_dataset_path,
                                  format="tsv", skip_header=False,
                                  fields=[("float", FLOAT),
                                          ("int", INT),
                                          ("text", TEXT)])
        TEXT.build_vocab(dst)
        itr = data.Iterator(dst, batch_size=2, device=-1, shuffle=False)
        fld_order = [k for k, v in dst.fields.items() if
                     v is not None and not v.is_target]
        batch = next(iter(itr))
        (x1, x2), y = batch
        x = (x1, x2)[fld_order.index("float")]
        self.assertEqual(y.data[0], 1)
        self.assertEqual(y.data[1], 12)
        self.assertAlmostEqual(x.data[0], 0.1, places=4)
        self.assertAlmostEqual(x.data[1], 0.5, places=4)
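Declaring INT with is_target=True lets the batch unpack into (inputs, target): the non-target fields arrive as x in field order and the target column as y. The assertions imply input rows shaped like the hypothetical lines below (the text values are invented; the real test data may differ):

# "0.1\t1\thello\n"
# "0.5\t12\tworld\n"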
github mgrenander / reproducing-paulus-xiong-socher / trainer.py
    ###############################
    # PREPROCESSING
    ###############################
    datasets = ["train", "val", "test"]
    for dataset in datasets:
        if not os.path.exists(os.path.join("data", dataset + ".tsv")):
            print("Creating TSV for " + dataset)
            convert_to_tsv(dataset)

    print("Creating datasets", end='', flush=True)
    curr_time = datetime.now()

    article_field = data.ReversibleField(tensor_type=torch.cuda.LongTensor, lower=True, tokenize=tokenizer_in)
    summary_field = data.ReversibleField(tensor_type=torch.cuda.LongTensor, lower=True, tokenize=tokenizer_out, init_token='')

    train_set = data.TabularDataset(path='./data/train.tsv', format='tsv', fields=[('article', article_field), ('summary', summary_field)])
    val_set = data.TabularDataset(path='./data/val.tsv', format='tsv', fields=[('article', article_field), ('summary', summary_field)])

    diff_time, curr_time = get_time_diff(curr_time)
    print(", took {} min".format(diff_time))

    print("Building vocabulary and creating batches", end='', flush=True)
    article_field.build_vocab(train_set, vectors="glove.6B.100d", max_size=encoder_vocab_size)
    summary_field.build_vocab(train_set, max_size=decoder_vocab_size)

    train_iter = data.BucketIterator(dataset=train_set, batch_size=batch_size, sort_key=lambda x: len(x.article), repeat=False, device=DEVICE)
    val_iter = data.BucketIterator(dataset=val_set, batch_size=batch_size, sort_key=lambda x: len(x.article), repeat=False, device=DEVICE)

    diff_time, curr_time = get_time_diff(curr_time)
    print(", took {} min".format(diff_time))
    ###############################
    # MODEL CREATION
    ###############################
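Note that ReversibleField's tensor_type argument was removed in later torchtext releases. A hedged modern equivalent of the field definitions above passes dtype instead and leaves device placement to the iterator's device argument:

# Sketch only; assumes torchtext >= 0.3 Field semantics.
article_field = data.ReversibleField(dtype=torch.long, lower=True,
                                     tokenize=tokenizer_in)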
github mgrenander / reproducing-paulus-xiong-socher / trainer.py
print("Beginning training")
    tqdm_epoch = tqdm(range(num_epochs), desc="Epoch")
    for epoch in tqdm_epoch:
        train_iter.init_epoch()
        tqdm_batch = tqdm(train_iter, desc="Batch")
        for b_id, batch in enumerate(tqdm_batch):
            encoder.batch_size = batch.batch_size  # Fixes weird bug where we get batch sizes that are not batch_size
            decoder.batch_size = batch.batch_size
            avg_loss = train(batch, encoder, decoder, encoder_opt, decoder_opt, loss_func, teacher_forcing_ratio)

    ###############################
    # TESTING
    ###############################
    # Load test set
    print("Loading test set")
    test_set = data.TabularDataset(path='./data/test.tsv', format='tsv', fields=[('article', article_field), ('summary', summary_field)])
    test_iter = data.BucketIterator(dataset=test_set, batch_size=batch_size, sort_key=lambda x: len(x.article), repeat=False, device=DEVICE)
    print("Evaluating model")
    evaluate(encoder=encoder, decoder=decoder, dataset=test_iter, rev_field=article_field)
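A hedged sketch of consuming the test iterator built above; each batch exposes one attribute per (name, field) pair passed to TabularDataset:

for batch in test_iter:
    articles = batch.article      # LongTensor of token indices, (seq_len, batch)
    summaries = batch.summary
    # article_field.reverse(articles) maps indices back to token strings
    break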
github xhuang31 / KEQA_WSDM19 / train_detection.py
random.seed(args.seed)
torch.backends.cudnn.deterministic = True

if not args.cuda:
    args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("Note: You are using GPU for training")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print("Warning: You have Cuda but not use it. You are using CPU for training.")

# Set up the data for training
TEXT = data.Field(lower=True)
ED = data.Field()
train = data.TabularDataset(path=os.path.join(args.output, 'dete_train.txt'), format='tsv', fields=[('text', TEXT), ('ed', ED)])
field = [('id', None), ('sub', None), ('entity', None), ('relation', None), ('obj', None), ('text', TEXT), ('ed', ED)]
dev, test = data.TabularDataset.splits(path=args.output, validation='valid.txt', test='test.txt', format='tsv', fields=field)
TEXT.build_vocab(train, dev, test)
ED.build_vocab(train, dev)

match_embedding = 0
if os.path.isfile(args.vector_cache):
    stoi, vectors, dim = torch.load(args.vector_cache)
    TEXT.vocab.vectors = torch.Tensor(len(TEXT.vocab), dim)
    for i, token in enumerate(TEXT.vocab.itos):
        wv_index = stoi.get(token, None)
        if wv_index is not None:
            TEXT.vocab.vectors[i] = vectors[wv_index]
            match_embedding += 1
        else:
            TEXT.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25)
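The vector cache loaded above is a (stoi, vectors, dim) triple produced with torch.save. A minimal sketch of writing a compatible file (the token set, dimensionality, and file name are assumptions):

import torch

stoi = {'the': 0, 'cat': 1}            # token -> row index into the matrix
vectors = torch.randn(len(stoi), 50)   # one 50-d embedding row per token
torch.save((stoi, vectors, 50), 'vector_cache.pt')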
github i-machine-think / machine / machine_task_lookup_example.py
    tgt = TargetField(include_eos=False)
    tabular_data_fields = [('src', src), ('tgt', tgt)]

    max_len = parameters['max_len']

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    # generate training and testing data
    train = torchtext.data.TabularDataset(
        path=train_path, format='tsv',
        fields=tabular_data_fields,
        filter_pred=len_filter
    )

    dev = torchtext.data.TabularDataset(
        path=valid_path, format='tsv',
        fields=tabular_data_fields,
        filter_pred=len_filter
    )

    monitor_data = OrderedDict()
    for dataset in test_paths:
        m = torchtext.data.TabularDataset(
            path=dataset, format='tsv',
            fields=tabular_data_fields,
            filter_pred=len_filter)
        monitor_data[dataset] = m

    return src, tgt, train, dev, monitor_data
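filter_pred drops any example for which the predicate returns False, here capping source and target length. A hedged alternative (directory layout assumed) builds the train and validation splits in a single call:

train, dev = torchtext.data.TabularDataset.splits(
    path='data', train='train.tsv', validation='valid.tsv',
    format='tsv', fields=tabular_data_fields, filter_pred=len_filter)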
github songyingxin / TextClassification-Pytorch / Utils / SST2_utils.py
def load_sst2(path, text_field, label_field, batch_size, device, embedding_file, cache_dir):

    train, dev, test = data.TabularDataset.splits(
        path=path, train='train.tsv', validation='dev.tsv',
        test='test.tsv', format='tsv', skip_header=True,
        fields=[('text', text_field), ('label', label_field)])
    print("the size of train: {}, dev:{}, test:{}".format(
        len(train.examples), len(dev.examples), len(test.examples)))
    vectors = vocab.Vectors(embedding_file, cache_dir)

    text_field.build_vocab(
        train, dev, test, max_size=25000,
        vectors=vectors, unk_init=torch.Tensor.normal_)
    label_field.build_vocab(train, dev, test)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test), batch_sizes=(batch_size, len(dev), len(test)),
        sort_key=lambda x: len(x.text), sort_within_batch=True,
        repeat=False, shuffle=True, device=device)
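Since the dev and test batch sizes equal the split lengths, those iterators each yield their entire split as a single batch. A hedged usage sketch:

for batch in train_iter:
    text, label = batch.text, batch.label  # attribute names come from the fields list
    break

dev_batch = next(iter(dev_iter))            # the whole dev split as one batch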
github bkj / pmet / pmet / data.py
def load_test_dataset(data_path, VALS, device=0):
    
    data_path = os.path.abspath(data_path)
    
    dataset = data.TabularDataset(
        path=data_path,
        format='tsv',
        fields=[
            ("val", VALS),
        ])
    
    return data.Iterator(
        dataset=dataset,
        batch_size=1,
        device=device,
        shuffle=False,
        sort=False,
        repeat=False,
    )
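A hedged usage sketch (field construction and file path are assumptions): VALS must be a Field whose vocabulary is built before iteration, since numericalization happens at batch time.

VALS = data.Field(lower=True)
test_iter = load_test_dataset('test.tsv', VALS, device=-1)  # -1 selects the CPU
VALS.build_vocab(test_iter.dataset)
for batch in test_iter:
    print(batch.val)   # one example per batch, in file order (shuffle=False)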
github INK-USC / DualRE / train.py
FIELDS = {
    'tokens': ('token', TOKEN),
    'stanford_pos': ('pos', POS),
    'stanford_ner': ('ner', NER),
    'relation': ('relation', RELATION),
    'subj_pst': ('subj_pst', PST),
    'obj_pst': ('obj_pst', PST),
    'pr_confidence': ('pr_confidence', PR_CONFIDENCE),
    'sl_confidence': ('sl_confidence', SL_CONFIDENCE)
}
dataset_vocab = data.TabularDataset(
    path=opt['data_dir'] + '/train-' + str(opt['labeled_ratio']) + '.json',
    format='json',
    fields=FIELDS)
dataset_train = data.TabularDataset(
    path=opt['data_dir'] + '/train-' + str(opt['labeled_ratio']) + '.json',
    format='json',
    fields=FIELDS)
dataset_infer = data.TabularDataset(
    path=opt['data_dir'] + '/raw-' + str(opt['unlabeled_ratio']) + '.json',
    format='json',
    fields=FIELDS)
dataset_dev = data.TabularDataset(path=opt['data_dir'] + '/dev.json', format='json', fields=FIELDS)
dataset_test = data.TabularDataset(
    path=opt['data_dir'] + '/test.json', format='json', fields=FIELDS)

print('=' * 100)
print('Labeled data path: ' + opt['data_dir'] + '/train-' + str(opt['labeled_ratio']) + '.json')
print('Unlabeled data path: ' + opt['data_dir'] + '/raw-' + str(opt['unlabeled_ratio']) + '.json')
print('Labeled instances #: %d, Unlabeled instances #: %d' % (len(dataset_train.examples),
                                                              len(dataset_infer.examples)))
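With format='json', each line of the input file is one JSON object, and the FIELDS dict maps its top-level keys to (attribute_name, Field) pairs, so the examples end up with token, pos, ner, relation, and confidence attributes. A hypothetical input line (the real files may differ):

# {"tokens": ["He", "visited", "Paris"],
#  "stanford_pos": ["PRP", "VBD", "NNP"],
#  "stanford_ner": ["O", "O", "LOCATION"],
#  "relation": "per:cities_of_residence",
#  "subj_pst": ["0", "1", "2"], "obj_pst": ["2", "1", "0"],
#  "pr_confidence": 1.0, "sl_confidence": 1.0}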