def test_json_valid_and_invalid_nested_key(self):
    self.write_test_nested_key_json_dataset()
    valid_fields = {'foods.vegetables.name': ('vegs', data.Field()),
                    'foods.fruits': ('fruits', data.Field())}
    invalid_fields = {'foods.vegetables.color': ('vegs', data.Field())}

    expected_examples = [
        {"fruits": ["Apple", "Banana"],
         "vegs": ["Broccoli", "Cabbage"]},
        {"fruits": ["Cherry", "Grape", "Lemon"],
         "vegs": ["Cucumber", "Lettuce"]},
        {"fruits": ["Orange", "Pear", "Strawberry"],
         "vegs": ["Marrow", "Spinach"]}
    ]
    dataset = data.TabularDataset(
        path=self.test_nested_key_json_dataset_path,
        format="json",
        fields=valid_fields)

    # check results
    for example, expect in zip(dataset.examples, expected_examples):
        self.assertEqual(example.vegs, expect['vegs'])
        self.assertEqual(example.fruits, expect['fruits'])

    # a nested key that is absent from the JSON must raise ValueError
    with self.assertRaises(ValueError):
        data.TabularDataset(
            path=self.test_nested_key_json_dataset_path,
            format="json",
            fields=invalid_fields)
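    # A sketch of the one-object-per-line JSON layout implied by the dotted keys
    # 'foods.fruits' and 'foods.vegetables.name' above (the actual file produced by
    # write_test_nested_key_json_dataset() may differ; this shape is an assumption):
    #
    #   {"foods": {"fruits": ["Apple", "Banana"],
    #              "vegetables": [{"name": "Broccoli"}, {"name": "Cabbage"}]}}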
def test_batch_iter(self):
    self.write_test_numerical_features_dataset()
    FLOAT = data.Field(use_vocab=False, sequential=False,
                       dtype=torch.float)
    INT = data.Field(use_vocab=False, sequential=False, is_target=True)
    TEXT = data.Field(sequential=False)

    dst = data.TabularDataset(path=self.test_numerical_features_dataset_path,
                              format="tsv", skip_header=False,
                              fields=[("float", FLOAT),
                                      ("int", INT),
                                      ("text", TEXT)])
    TEXT.build_vocab(dst)
    itr = data.Iterator(dst, batch_size=2, device=-1, shuffle=False)
    fld_order = [k for k, v in dst.fields.items()
                 if v is not None and not v.is_target]
    batch = next(iter(itr))
    (x1, x2), y = batch
    x = (x1, x2)[fld_order.index("float")]

    self.assertEqual(y.data[0], 1)
    self.assertEqual(y.data[1], 12)
    self.assertAlmostEqual(x.data[0], 0.1, places=4)
    self.assertAlmostEqual(x.data[1], 0.5, places=4)
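    # A sketch of the TSV rows the assertions above imply; the text column values
    # are placeholders (assumptions), only the float/int columns come from the
    # expected values:
    #
    #   0.1<TAB>1<TAB>some text
    #   0.5<TAB>12<TAB>more text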
###############################
# PREPROCESSING
###############################
datasets = ["train", "val", "test"]
for dataset in datasets:
    if not os.path.exists(os.path.join("data", dataset + ".tsv")):
        print("Creating TSV for " + dataset)
        convert_to_tsv(dataset)
print("Creating datasets", end='', flush=True)
curr_time = datetime.now()
article_field = data.ReversibleField(tensor_type=torch.cuda.LongTensor, lower=True, tokenize=tokenizer_in)
summary_field = data.ReversibleField(tensor_type=torch.cuda.LongTensor, lower=True, tokenize=tokenizer_out, init_token='')
train_set = data.TabularDataset(path='./data/train.tsv', format='tsv', fields=[('article', article_field), ('summary', summary_field)])
val_set = data.TabularDataset(path='./data/val.tsv', format='tsv', fields=[('article', article_field), ('summary', summary_field)])
diff_time, curr_time = get_time_diff(curr_time)
print(", took {} min".format(diff_time))
print("Building vocabulary and creating batches", end='', flush=True)
article_field.build_vocab(train_set, vectors="glove.6B.100d", max_size=encoder_vocab_size)
summary_field.build_vocab(train_set, max_size=decoder_vocab_size)
train_iter = data.BucketIterator(dataset=train_set, batch_size=batch_size, sort_key=lambda x: len(x.article), repeat=False, device=DEVICE)
val_iter = data.BucketIterator(dataset=val_set, batch_size=batch_size, sort_key=lambda x: len(x.article), repeat=False, device=DEVICE)
diff_time, curr_time = get_time_diff(curr_time)
print(", took {} min".format(diff_time))
###############################
# MODEL CREATION
###############################
print("Beginning training")
tqdm_epoch = tqdm(range(num_epochs), desc="Epoch")
for epoch in tqdm_epoch:
    train_iter.init_epoch()
    tqdm_batch = tqdm(train_iter, desc="Batch")
    for b_id, batch in enumerate(tqdm_batch):
        # The final batch of an epoch can be smaller than batch_size, so resize per batch
        encoder.batch_size = batch.batch_size
        decoder.batch_size = batch.batch_size
        avg_loss = train(batch, encoder, decoder, encoder_opt, decoder_opt, loss_func, teacher_forcing_ratio)
###############################
# TESTING
###############################
# Load test set
print("Loading test set")
test_set = data.TabularDataset(path='./data/test.tsv', format='tsv', fields=[('article', article_field), ('summary', summary_field)])
test_iter = data.BucketIterator(dataset=test_set, batch_size=batch_size, sort_key=lambda x: len(x.article), repeat=False, device=DEVICE)
print("Evaluating model")
evaluate(encoder=encoder, decoder=decoder, dataset=test_iter, rev_field=article_field)
random.seed(args.seed)
torch.backends.cudnn.deterministic = True
if not args.cuda:
    args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("Note: You are using the GPU for training")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print("Warning: CUDA is available but not being used. You are training on the CPU.")
# Set up the data for training
TEXT = data.Field(lower=True)
ED = data.Field()
train = data.TabularDataset(path=os.path.join(args.output, 'dete_train.txt'), format='tsv', fields=[('text', TEXT), ('ed', ED)])
field = [('id', None), ('sub', None), ('entity', None), ('relation', None), ('obj', None), ('text', TEXT), ('ed', ED)]
dev, test = data.TabularDataset.splits(path=args.output, validation='valid.txt', test='test.txt', format='tsv', fields=field)
TEXT.build_vocab(train, dev, test)
ED.build_vocab(train, dev)
match_embedding = 0
if os.path.isfile(args.vector_cache):
    stoi, vectors, dim = torch.load(args.vector_cache)
    TEXT.vocab.vectors = torch.Tensor(len(TEXT.vocab), dim)
    for i, token in enumerate(TEXT.vocab.itos):
        wv_index = stoi.get(token, None)
        if wv_index is not None:
            TEXT.vocab.vectors[i] = vectors[wv_index]
            match_embedding += 1
        else:
            TEXT.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25)
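
# A minimal sketch (an assumed helper, not part of the original code) of how a
# vector cache with the (stoi, vectors, dim) layout loaded above could be built
# from a pretrained torchtext Vectors object and saved with torch.save:
def build_vector_cache(cache_path):
    from torchtext import vocab as tt_vocab
    pretrained = tt_vocab.GloVe(name='6B', dim=300)  # any torchtext Vectors subclass works
    torch.save((pretrained.stoi, pretrained.vectors, pretrained.dim), cache_path)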
tgt = TargetField(include_eos=False)
tabular_data_fields = [('src', src), ('tgt', tgt)]
max_len = parameters['max_len']
def len_filter(example):
    return len(example.src) <= max_len and len(example.tgt) <= max_len

# generate training and testing data
train = torchtext.data.TabularDataset(
    path=train_path, format='tsv',
    fields=tabular_data_fields,
    filter_pred=len_filter
)
dev = torchtext.data.TabularDataset(
    path=valid_path, format='tsv',
    fields=tabular_data_fields,
    filter_pred=len_filter
)
monitor_data = OrderedDict()
for dataset in test_paths:
    m = torchtext.data.TabularDataset(
        path=dataset, format='tsv',
        fields=tabular_data_fields,
        filter_pred=len_filter)
    monitor_data[dataset] = m

return src, tgt, train, dev, monitor_data
def load_sst2(path, text_field, label_field, batch_size, device, embedding_file, cache_dir):
    train, dev, test = data.TabularDataset.splits(
        path=path, train='train.tsv', validation='dev.tsv',
        test='test.tsv', format='tsv', skip_header=True,
        fields=[('text', text_field), ('label', label_field)])
    print("the size of train: {}, dev: {}, test: {}".format(
        len(train.examples), len(dev.examples), len(test.examples)))
    vectors = vocab.Vectors(embedding_file, cache_dir)
    text_field.build_vocab(
        train, dev, test, max_size=25000,
        vectors=vectors, unk_init=torch.Tensor.normal_)
    label_field.build_vocab(train, dev, test)
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_sizes=(batch_size, len(dev), len(test)),
        sort_key=lambda x: len(x.text),
        sort_within_batch=True, repeat=False, shuffle=True,
        device=device)
    return train_iter, dev_iter, test_iter
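
# A minimal usage sketch for load_sst2; the field setup, device, paths, and file
# names below are illustrative assumptions, not part of the original code.
def _load_sst2_example():
    TEXT_FIELD = data.Field(lower=True, batch_first=True)
    LABEL_FIELD = data.Field(sequential=False, unk_token=None)
    return load_sst2(path='data/SST-2', text_field=TEXT_FIELD, label_field=LABEL_FIELD,
                     batch_size=32, device=torch.device('cpu'),
                     embedding_file='glove.6B.300d.txt', cache_dir='.vector_cache')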
def load_test_dataset(data_path, VALS, device=0):
    data_path = os.path.abspath(data_path)
    dataset = data.TabularDataset(
        path=data_path,
        format='tsv',
        fields=[
            ("val", VALS),
        ])
    return data.Iterator(
        dataset=dataset,
        batch_size=1,
        device=device,
        shuffle=False,
        sort=False,
        repeat=False,
    )
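
# A minimal usage sketch for load_test_dataset; the field definition and file name
# below are illustrative assumptions, not part of the original code.
def _load_test_dataset_example():
    VALS = data.Field(sequential=True)
    test_iter = load_test_dataset('test.tsv', VALS, device=-1)
    VALS.build_vocab(test_iter.dataset)  # a vocabulary is needed before batches can be numericalized
    for batch in test_iter:
        print(batch.val)  # batch_size=1, so each batch holds a single example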
FIELDS = {
    'tokens': ('token', TOKEN),
    'stanford_pos': ('pos', POS),
    'stanford_ner': ('ner', NER),
    'relation': ('relation', RELATION),
    'subj_pst': ('subj_pst', PST),
    'obj_pst': ('obj_pst', PST),
    'pr_confidence': ('pr_confidence', PR_CONFIDENCE),
    'sl_confidence': ('sl_confidence', SL_CONFIDENCE)
}
dataset_vocab = data.TabularDataset(
    path=opt['data_dir'] + '/train-' + str(opt['labeled_ratio']) + '.json',
    format='json',
    fields=FIELDS)
dataset_train = data.TabularDataset(
    path=opt['data_dir'] + '/train-' + str(opt['labeled_ratio']) + '.json',
    format='json',
    fields=FIELDS)
dataset_infer = data.TabularDataset(
    path=opt['data_dir'] + '/raw-' + str(opt['unlabeled_ratio']) + '.json',
    format='json',
    fields=FIELDS)
dataset_dev = data.TabularDataset(path=opt['data_dir'] + '/dev.json', format='json', fields=FIELDS)
dataset_test = data.TabularDataset(
    path=opt['data_dir'] + '/test.json', format='json', fields=FIELDS)
print('=' * 100)
print('Labeled data path: ' + opt['data_dir'] + '/train-' + str(opt['labeled_ratio']) + '.json')
print('Unlabeled data path: ' + opt['data_dir'] + '/raw-' + str(opt['unlabeled_ratio']) + '.json')
print('Labeled instances #: %d, Unlabeled instances #: %d' % (len(dataset_train.examples),
                                                              len(dataset_infer.examples)))
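
# A sketch of the per-line JSON layout the FIELDS mapping above expects; every
# value shown here is illustrative (an assumption), only the top-level keys are
# taken from FIELDS:
#
#   {"tokens": ["He", "founded", "Acme"], "stanford_pos": ["PRP", "VBD", "NNP"],
#    "stanford_ner": ["O", "O", "ORG"], "relation": "org:founded_by",
#    "subj_pst": [0, 1, 1], "obj_pst": [1, 1, 0],
#    "pr_confidence": 0.9, "sl_confidence": 0.8}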