# torchtext (legacy API) imports assumed by this snippet.
from torchtext.data import Field, TabularDataset, Iterator

def read_data(config):
    args = Field(lower=True, tokenize='spacy') if config.compositional_args else Field()
    rels = Field(lower=True, tokenize='spacy') if config.relational_args else Field()
    # TODO: we will need to add a header to the files.
    data = TabularDataset(path=config.data_path, format='tsv',
                          fields=[('subject', args), ('relation', rels), ('object', args)])
    train, dev = data.split(split_ratio=0.99)
    print('Train size:', len(train), ' Dev size:', len(dev))
    args.build_vocab(train)
    rels.build_vocab(train)
    config.n_args = len(args.vocab)
    config.n_rels = len(rels.vocab)
    print("#Args:", config.n_args, " #Rels:", config.n_rels)
    # `args` is the subject/object Field in this scope, so the device must come
    # from the config (the original `device=args.gpu` would raise AttributeError).
    train_iter, dev_iter = Iterator.splits((train, dev), batch_size=config.batch_size,
                                           device=config.gpu)
    train_iter.repeat = False
    # TODO: duplicate the relations field and detach it from the regular order,
    # so we can effectively sample relations.
    return train_iter, dev_iter
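A minimal way to drive read_data, assuming a namespace-style config carrying the attributes the function reads; the file path and flag values here are illustrative, not from the original repo:

from argparse import Namespace

config = Namespace(
    data_path='data/triples.tsv',  # hypothetical subject/relation/object TSV
    compositional_args=True,
    relational_args=False,
    batch_size=64,
    gpu=-1,                        # -1 selects the CPU in legacy torchtext
)
train_iter, dev_iter = read_data(config)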
def mr(text_field, label_field, **kargs):
    # `mydatasets`, `data` (torchtext.data), and `args` are module-level
    # globals in the original code.
    train_data, dev_data = mydatasets.MR.splits(text_field, label_field)
    text_field.build_vocab(train_data, dev_data)
    label_field.build_vocab(train_data, dev_data)
    train_iter, dev_iter = data.Iterator.splits(
        (train_data, dev_data),
        batch_sizes=(args.batch_size, len(dev_data)),
        **kargs)
    return train_iter, dev_iter
def mrs_two_mui(path, train_name, dev_name, test_name, char_data, text_field, label_field,
                static_text_field, static_label_field, **kargs):
    train_data, dev_data, test_data = mydatasets_self_two.MR.splits(
        path, train_name, dev_name, test_name, char_data, text_field, label_field)
    static_train_data, static_dev_data, static_test_data = mydatasets_self_two.MR.splits(
        path, train_name, dev_name, test_name, char_data, static_text_field, static_label_field)
    print("len(train_data) {} ".format(len(train_data)))
    # The original printed this under the label "len(train_data)"; it is the static split.
    print("len(static_train_data) {} ".format(len(static_train_data)))
    text_field.build_vocab(train_data, min_freq=args.min_freq)
    label_field.build_vocab(train_data)
    static_text_field.build_vocab(static_train_data, static_dev_data, static_test_data,
                                  min_freq=args.min_freq)
    static_label_field.build_vocab(static_train_data, static_dev_data, static_test_data)
    train_iter, dev_iter, test_iter = data.Iterator.splits(
        (train_data, dev_data, test_data),
        batch_sizes=(args.batch_size, len(dev_data), len(test_data)),
        **kargs)
    return train_iter, dev_iter, test_iter
def load_dataset(text_field, label_field, args, **kwargs):
    train_dataset, dev_dataset = dataset.get_dataset('data', text_field, label_field)
    if args.static and args.pretrained_name and args.pretrained_path:
        vectors = load_word_vectors(args.pretrained_name, args.pretrained_path)
        text_field.build_vocab(train_dataset, dev_dataset, vectors=vectors)
    else:
        text_field.build_vocab(train_dataset, dev_dataset)
    label_field.build_vocab(train_dataset, dev_dataset)
    train_iter, dev_iter = data.Iterator.splits(
        (train_dataset, dev_dataset),
        batch_sizes=(args.batch_size, len(dev_dataset)),
        sort_key=lambda x: len(x.text),
        **kwargs)
    return train_iter, dev_iter
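A sketch of how load_dataset is typically called, assuming an argparse-style args object; the attribute values below are assumptions, and the pretrained-vector branch is skipped because args.static is False:

from argparse import Namespace
from torchtext import data

text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)
args = Namespace(static=False, pretrained_name=None, pretrained_path=None, batch_size=64)

# Extra kwargs are forwarded to Iterator.splits.
train_iter, dev_iter = load_dataset(text_field, label_field, args, device=-1, repeat=False)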
def mrs_five(text_field, label_field, **kargs):
    train_data, dev_data, test_data = mydatasets_self_five.MR.splits(text_field, label_field)
    print("len(train_data) {} ".format(len(train_data)))
    text_field.build_vocab(train_data)
    label_field.build_vocab(train_data)
    train_iter, dev_iter, test_iter = data.Iterator.splits(
        (train_data, dev_data, test_data),
        batch_sizes=(args.batch_size, len(dev_data), len(test_data)),
        **kargs)
    return train_iter, dev_iter, test_iter
def mrs_five_mui(path, train_name, dev_name, test_name, char_data, text_field, label_field,
                 static_text_field, static_label_field, **kargs):
    train_data, dev_data, test_data = mydatasets_self_five.MR.splits(
        path, train_name, dev_name, test_name, char_data, text_field, label_field)
    static_train_data, static_dev_data, static_test_data = mydatasets_self_five.MR.splits(
        path, train_name, dev_name, test_name, char_data, static_text_field, static_label_field)
    print("len(train_data) {} ".format(len(train_data)))
    # The original printed this under the label "len(train_data)"; it is the static split.
    print("len(static_train_data) {} ".format(len(static_train_data)))
    text_field.build_vocab(train_data, min_freq=args.min_freq)
    label_field.build_vocab(train_data)
    static_text_field.build_vocab(static_train_data, static_dev_data, static_test_data,
                                  min_freq=args.min_freq)
    static_label_field.build_vocab(static_train_data, static_dev_data, static_test_data)
    train_iter, dev_iter, test_iter = data.Iterator.splits(
        (train_data, dev_data, test_data),
        batch_sizes=(args.batch_size, len(dev_data), len(test_data)),
        **kargs)
    return train_iter, dev_iter, test_iter
# Excerpt from a dataset-loading routine: the opening of the `text` Field and
# the dataset dispatch were cut off upstream, so the `if` guard and the first
# Field arguments below are reconstructed assumptions.
from torchtext.vocab import GloVe

data_loader = dict()
if args.dataset == 'IMDB':
    text = data.Field(lower=True, batch_first=True)
    label = data.Field(lower=True)
    label_pred = data.Field(use_vocab=False, fix_length=1)
    fname = data.Field(use_vocab=False, fix_length=1)
    train, valid, test = IMDB_modified.splits(text, label, label_pred, fname,
                                              root=root, model_name=args.model_name,
                                              load_pred=args.load_pred)
    print("build vocab...")
    text.build_vocab(train,
                     vectors=GloVe(name='6B', dim=embedding_dim, cache=root),
                     max_size=max_total_num_words)
    label.build_vocab(train)
    print("Create Iterator objects for multiple splits of a dataset...")
    train_loader, valid_loader, test_loader = data.Iterator.splits(
        (train, valid, test), batch_size=batch_size, device=device, repeat=False)
    data_loader['word_idx'] = text.vocab.itos
    data_loader['x_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor
    data_loader['y_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor
    data_loader['max_total_num_words'] = max_total_num_words
    data_loader['embedding_dim'] = embedding_dim
    data_loader['max_num_words'] = 50
    # Sentences per example, inferred from one batch's flattened token length.
    data_loader['max_num_sents'] = int(next(iter(train_loader)).text.size(-1)
                                       / data_loader['max_num_words'])
else:
    raise UnknownDatasetError()
data_loader['train'] = train_loader
data_loader['valid'] = valid_loader
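Downstream code can then draw batches directly from the stored iterators; a minimal consumption sketch, assuming the data_loader dict built above:

for batch in data_loader['train']:
    x = batch.text.type(data_loader['x_type'])    # word indices, batch-first
    y = batch.label.type(data_loader['y_type'])
    print(x.size(), y.size())
    break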
def mrs_two(text_field, label_field, **kargs):
    train_data, dev_data, test_data = mydatasets_self_two.MR.splits(text_field, label_field)
    print("len(train_data) {} ".format(len(train_data)))
    text_field.build_vocab(train_data)
    label_field.build_vocab(train_data)
    train_iter, dev_iter, test_iter = data.Iterator.splits(
        (train_data, dev_data, test_data),
        batch_sizes=(args.batch_size, len(dev_data), len(test_data)),
        **kargs)
    return train_iter, dev_iter, test_iter
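All of the mrs_* helpers share this calling pattern; a minimal sketch for mrs_two, assuming the module-level args global (providing batch_size) is set and that extra kwargs are forwarded to Iterator.splits:

from torchtext import data

text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)
train_iter, dev_iter, test_iter = mrs_two(text_field, label_field, device=-1, repeat=False)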