from torchtext import data

TEXT = data.Field()
LABELS = data.Field()

train, val, test = data.TabularDataset.splits(
    path='~/chainer-research/jmt-data/pos_wsj/pos_wsj', train='.train',
    validation='.dev', test='.test', format='tsv',
    fields=[('text', TEXT), ('labels', LABELS)])

print(train.fields)
print(len(train))
print(vars(train[0]))

train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=3, sort_key=lambda x: len(x.text), device="cuda:0")

LABELS.build_vocab(train.labels)
TEXT.build_vocab(train.text)

print(TEXT.vocab.freqs.most_common(10))
print(LABELS.vocab.itos)
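# Hedged usage sketch (not part of the original snippet): each batch exposes
# attributes named after the fields declared above, i.e. batch.text and batch.labels.
for batch in train_iter:
    tokens = batch.text      # LongTensor of shape (seq_len, batch_size)
    labels = batch.labels
    print(tokens.shape, labels.shape)
    break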
def generate_test(data_path="../data_processed"):
    # tokenize_vi, tokenize_en and mytestfilter are assumed to be defined elsewhere.
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = ""
    SRC = data.Field(tokenize=tokenize_vi, pad_token=BLANK_WORD)
    TGT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD,
                     eos_token=EOS_WORD, pad_token=BLANK_WORD)
    MAX_LEN = 100

    train, val, test = data.TabularDataset.splits(
        path=data_path, train='train.tsv', test='test2013.tsv',
        validation='dev.tsv', fields=[('src', SRC), ('trg', TGT)],
        format='tsv', filter_pred=mytestfilter)

    MIN_FREQ = 2
    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train.trg, min_freq=MIN_FREQ)
    return (SRC, TGT, train, val, test)
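# Hedged usage sketch (not part of the original function): build a training iterator
# from the returned splits; the batch size here is a placeholder.
SRC, TGT, train, val, test = generate_test(data_path="../data_processed")
train_iter = data.BucketIterator(
    train, batch_size=32, sort_key=lambda x: len(x.src), shuffle=True)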
BIDIRECTIONAL = True
DROPOUT = 0.5
NUM_EPOCHS = 20
####################################
# Preparing Data #
####################################
# 1. data.Field()
TEXT = data.Field(include_lengths=True, pad_token='', unk_token='')
TAG_LABEL = data.LabelField()
AGE_LABEL = data.LabelField()
GENDER_LABEL = data.LabelField()
# 2. data.TabularDataset
train_data, test_data = data.TabularDataset.splits(
    path=TrustPilot_processed_dataset_path,
    train="train.csv",
    test="test.csv",
    fields=[('text', TEXT), ('tag_label', TAG_LABEL),
            ('age_label', AGE_LABEL), ('gender_label', GENDER_LABEL)],
    format="csv")
# 3. Split train_data to train_data, valid_data
train_data, valid_data = train_data.split(random_state=random.seed(SEED))
print("Number of train_data = {}".format(len(train_data)))
print("Number of valid_data = {}".format(len(valid_data)))
print("Number of test_data = {}\n".format(len(test_data)))
# 4. data.BucketIterator
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    # the original snippet is cut off here; sort_key is an assumed completion
    sort_key=lambda x: len(x.text))
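# Hedged usage sketch (not part of the original snippet): the vocabs must be built
# before iterating, and with include_lengths=True batch.text is a (tensor, lengths) pair.
TEXT.build_vocab(train_data)
TAG_LABEL.build_vocab(train_data)
AGE_LABEL.build_vocab(train_data)
GENDER_LABEL.build_vocab(train_data)
for batch in train_iter:
    tokens, lengths = batch.text
    tag_y, age_y, gender_y = batch.tag_label, batch.age_label, batch.gender_label
    break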
if not args.cuda:
    args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("Note: You are using GPU for training")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print("Warning: CUDA is available but not being used. You are training on the CPU.")
# Set up the data for training
TEXT = data.Field(lower=True)
ED = data.Field()
train = data.TabularDataset(path=os.path.join(args.output, 'dete_train.txt'), format='tsv',
                            fields=[('text', TEXT), ('ed', ED)])
field = [('id', None), ('sub', None), ('entity', None), ('relation', None), ('obj', None),
         ('text', TEXT), ('ed', ED)]
dev, test = data.TabularDataset.splits(path=args.output, validation='valid.txt', test='test.txt',
                                       format='tsv', fields=field)
TEXT.build_vocab(train, dev, test)
ED.build_vocab(train, dev)

# Load pretrained word embeddings from a cached (stoi, vectors, dim) file.
match_embedding = 0
if os.path.isfile(args.vector_cache):
    stoi, vectors, dim = torch.load(args.vector_cache)
    TEXT.vocab.vectors = torch.Tensor(len(TEXT.vocab), dim)
    for i, token in enumerate(TEXT.vocab.itos):
        wv_index = stoi.get(token, None)
        if wv_index is not None:
            TEXT.vocab.vectors[i] = vectors[wv_index]
            match_embedding += 1
        else:
            # tokens missing from the embedding file get a random init
            TEXT.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25)
else:
    print("Error: Need word embedding pt file")
:param val:
:param test:
:param skip_header:
:param save_vocab_path:
:param args:
:return:
"""
if not os.path.exists(save_vocab_path):
    os.mkdir(save_vocab_path)
dataset_fields = []
for field in fields:
    dataset_fields.append((field.name, field.field))
print(dataset_fields)
dataset = TabularDataset.splits(root_path, ".data", train, val, test,
                                fields=dataset_fields, skip_header=skip_header,
                                format=format, **args)
for f_input in fields:
    name = f_input.name
    field = f_input.field
    vocab = f_input.vocab
    if vocab is None:
        # verify if working properly
        field.build_vocab(*dataset, max_size=f_input.max_size, min_freq=f_input.min_freq,
                          vectors=f_input.vectors, unk_init=f_input.unk_init,
                          vectors_cache=f_input.vectors_cache)
        with open(os.path.join(save_vocab_path, "{}.json".format(name)), "w") as jfile:
            json.dump(field.vocab.stoi, jfile, sort_keys=True)
    else:
        with open(vocab, "r") as jfile:
            # the original snippet is truncated here; presumably the saved
            # stoi mapping is loaded back, e.g.:
            stoi = json.load(jfile)
def sst_word_char(path, word_field, char_field, label_field, batch_size, device,
                  word_emb_file, char_emb_file, cache_dir):
    fields = {
        'text': [('text_word', word_field), ('text_char', char_field)],
        'label': ('label', label_field)
    }

    train, dev, test = data.TabularDataset.splits(
        path=path, train='train.jsonl', validation='dev.jsonl',
        test='test.jsonl', format='json', skip_header=True,
        fields=fields)

    word_vectors = vocab.Vectors(word_emb_file, cache_dir)
    char_vectors = vocab.Vectors(char_emb_file, cache_dir)

    word_field.build_vocab(
        train, dev, test, max_size=25000,
        vectors=word_vectors, unk_init=torch.Tensor.normal_)
    char_field.build_vocab(
        train, dev, test, max_size=94,
        vectors=char_vectors, unk_init=torch.Tensor.normal_)
    label_field.build_vocab(train, dev, test)
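    # Hedged continuation (assumption): the snippet stops after building the vocabs,
    # so the iterator creation and return value below are not from the original.
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test), batch_size=batch_size, device=device,
        sort_key=lambda x: len(x.text_word))
    return train_iter, dev_iter, test_iter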
def _create_loaders(path, traintsv, valtsv):
    def parse_int(tok, *args):
        return int(tok)

    quesid = data.Field(sequential=False, use_vocab=False, postprocessing=data.Pipeline(parse_int))
    ques = data.Field(include_lengths=True)
    imgid = data.Field(sequential=False, use_vocab=False, postprocessing=data.Pipeline(parse_int))
    ans = data.Field(sequential=False, use_vocab=False, postprocessing=data.Pipeline(parse_int))

    train_data, val_data = data.TabularDataset.splits(
        path=path, train=traintsv, validation=valtsv,
        fields=[('quesid', quesid), ('ques', ques), ('imgid', imgid), ('ans', ans)],
        format='tsv')

    batch_sizes = (1, 1)
    train_loader, val_loader = data.BucketIterator.splits(
        (train_data, val_data), batch_sizes=batch_sizes, repeat=False,
        sort_key=lambda x: len(x.ques))

    ques.build_vocab(train_data)
    print('vocabulary size: {}'.format(len(ques.vocab.stoi)))
    return ques, train_loader, val_loader
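# Hedged usage sketch (not part of the original snippet); the path and TSV names
# are placeholders.
ques_field, train_loader, val_loader = _create_loaders('data/', 'vqa_train.tsv', 'vqa_val.tsv')
for batch in train_loader:
    tokens, lengths = batch.ques   # include_lengths=True -> (tensor, lengths) pair
    question_ids, answers = batch.quesid, batch.ans
    break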
# dict_fields maps the JSON keys to fields; the opening of this dict is cut off
# in the original snippet, so the first entries below are assumed from list_fields.
dict_fields = {'id': ('id', self.RAW),
               's_idx': ('s_idx', self.LABEL),
               'e_idx': ('e_idx', self.LABEL),
               'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
               'question': [('q_word', self.WORD), ('q_char', self.CHAR)]}
list_fields = [('id', self.RAW), ('s_idx', self.LABEL), ('e_idx', self.LABEL),
               ('c_word', self.WORD), ('c_char', self.CHAR),
               ('q_word', self.WORD), ('q_char', self.CHAR)]

if os.path.exists(dataset_path):
    print("loading splits...")
    train_examples = torch.load(train_examples_path)
    dev_examples = torch.load(dev_examples_path)
    self.train = data.Dataset(examples=train_examples, fields=list_fields)
    self.dev = data.Dataset(examples=dev_examples, fields=list_fields)
else:
    print("building splits...")
    self.train, self.dev = data.TabularDataset.splits(
        path=path,
        train=f'{args.train_file}l',
        validation=f'{args.dev_file}l',
        format='json',
        fields=dict_fields)
    os.makedirs(dataset_path)
    torch.save(self.train.examples, train_examples_path)
    torch.save(self.dev.examples, dev_examples_path)

# cut contexts that are too long in the training set, for efficiency
if args.context_threshold > 0:
    self.train.examples = [e for e in self.train.examples
                           if len(e.c_word) <= args.context_threshold]

print("building vocab...")
self.CHAR.build_vocab(self.train, self.dev)
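# Hedged continuation (assumption, not in the original snippet): the word vocab is
# typically built with pretrained vectors as well, and BucketIterators then feed
# training; the GloVe choice and batch sizes below are placeholders.
from torchtext.vocab import GloVe
self.WORD.build_vocab(self.train, self.dev, vectors=GloVe(name='6B', dim=100))
self.train_iter, self.dev_iter = data.BucketIterator.splits(
    (self.train, self.dev), batch_sizes=(60, 100),
    sort_key=lambda x: len(x.c_word))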
def load_datasets():
    text = data.Field(include_lengths=True)
    tags = data.Field()

    # NOTE: in this snippet the test split points at the validation file.
    train_data, val_data, test_data = data.TabularDataset.splits(
        path='RNN_Data_files/', train='train_data.tsv', validation='val_data.tsv',
        test='val_data.tsv', fields=[('text', text), ('tags', tags)], format='tsv')

    batch_sizes = (args.batch_size, args.batch_size, args.batch_size)
    train_loader, val_loader, test_loader = data.BucketIterator.splits(
        (train_data, val_data, test_data), batch_sizes=batch_sizes,
        sort_key=lambda x: len(x.text))

    text.build_vocab(train_data)
    tags.build_vocab(train_data)

    dataloaders = {'train': train_loader,
                   'validation': val_loader,
                   'test': val_loader}
    return text, tags, dataloaders
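# Hedged usage sketch (not part of the original snippet); `args.batch_size`
# must be defined before load_datasets() is called.
text_field, tag_field, loaders = load_datasets()
for batch in loaders['train']:
    (tokens, lengths), tag_ids = batch.text, batch.tags
    break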