from torchtext import data, datasets
from torchtext.vocab import GloVe, CharNGram, FastText
# Approach 1:
# set up fields
TEXT = data.Field(lower=True, batch_first=True)
# make splits for data
train, valid, test = datasets.WikiText2.splits(TEXT)
# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0])['text'][0:10])
# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
# make iterator for splits
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test), batch_size=3, bptt_len=30, device="cuda:0")
# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.target)
# Approach 2:
train_iter, valid_iter, test_iter = datasets.WikiText2.iters(batch_size=4, bptt_len=30)
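# A hedged usage sketch (not part of the original snippets): with batch_first=True,
# batch.text and batch.target from the BPTT iterator are (batch_size, bptt_len)
# LongTensors, and TEXT.vocab.vectors can seed an embedding layer for a language model.
import torch.nn as nn
embed = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False).to("cuda:0")
criterion = nn.CrossEntropyLoss(ignore_index=TEXT.vocab.stoi['<pad>'])
for batch in train_iter:
    emb = embed(batch.text)  # (batch_size, bptt_len, 300)
    # logits = model(emb)    # `model` is a hypothetical LM head producing (batch, bptt_len, vocab)
    # loss = criterion(logits.view(-1, len(TEXT.vocab)), batch.target.view(-1))
    break                    # train_iter repeats indefinitely, so stop after one batch here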
# Approach 1:
# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)
# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)
# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))
# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)
# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())
# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_size=3, device="cuda:0")
# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)
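# A hedged note (not part of the original snippet): because the field was built with
# include_lengths=True, batch.text is a (padded LongTensor, lengths) pair, which is
# exactly what nn.utils.rnn.pack_padded_sequence expects for an RNN encoder.
import torch.nn as nn
padded, lengths = batch.text  # padded: (batch, max_len), lengths: (batch,)
embed = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=True).to(padded.device)
packed = nn.utils.rnn.pack_padded_sequence(
    embed(padded), lengths.cpu(), batch_first=True, enforce_sorted=False)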
# Approach 2:
train_iter, test_iter = datasets.IMDB.iters(batch_size=4)
init_idx = None
if isinstance(pretrained_embeddings, int):
    sentences.build_vocab(train_data, val_data, test_data)
    embedding_dim = pretrained_embeddings
else:
    if pretrained_embeddings == 'ner':
        vectors = CaseInsensitiveVectors(
            expanduser('~/data/sdtw_data/ner/%s' %
                       tagger_languages[language]),
            unk_init=lambda x: x.normal_(0, 1),
            cache=expanduser('~/cache'))
    elif 'glove' in pretrained_embeddings:
        _, name, dim = pretrained_embeddings.split('.')
        dim = dim[:-1]
        GloVe.__getitem__ = CaseInsensitiveVectors.__getitem__
        vectors = GloVe(name=name, dim=dim, cache=expanduser('~/cache'))
    elif pretrained_embeddings == 'fasttext':
        FastText.__getitem__ = CaseInsensitiveVectors.__getitem__
        FastText.cache = CaseInsensitiveVectors.cache
        vectors = FastText(language=language,
                           cache=expanduser('~/cache'))
    # extend the vocab with words of the test/val sets that have embeddings in the
    # pre-trained embedding table
    # (a production version would do this dynamically at inference time)
    counter = Counter()
    sentences.build_vocab(val_data, test_data)
    for word in sentences.vocab.stoi:
        if word in vectors.stoi or word.lower() in vectors.stoi or \
                re.sub(r'\d', '0', word.lower()) in vectors.stoi:
            counter[word] = 1
    eval_vocab = Vocab(counter)
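# CaseInsensitiveVectors above is project-specific and not shown; a purely illustrative
# guess at the lookup it provides (assumed, not the original implementation): fall back
# to the lowercased and digit-normalized forms before resorting to unk_init.
import re
import torch
from torchtext.vocab import Vectors

class CaseInsensitiveVectorsSketch(Vectors):
    def __getitem__(self, token):
        for candidate in (token, token.lower(), re.sub(r'\d', '0', token.lower())):
            if candidate in self.stoi:
                return self.vectors[self.stoi[candidate]]
        return self.unk_init(torch.Tensor(self.dim))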
def make_word_embeddings(opt, word_dict, fields):
    word_padding_idx = word_dict.stoi[table.IO.PAD_WORD]
    num_word = len(word_dict)
    emb_word = nn.Embedding(num_word, opt.word_vec_size,
                            padding_idx=word_padding_idx)
    if len(opt.pre_word_vecs) > 0:
        vectors = torchtext.vocab.GloVe(
            name="840B", cache=opt.pre_word_vecs, dim=str(opt.word_vec_size))
        fields["src"].vocab.load_vectors(vectors)
        emb_word.weight.data.copy_(fields["src"].vocab.vectors)
    if opt.fix_word_vecs:
        # <unk> is 0
        num_special = len(table.IO.special_token_list)
        # zero the special-token vectors in the fixed embedding (emb_word)
        emb_word.weight.data[:num_special].zero_()
        emb_special = nn.Embedding(
            num_special, opt.word_vec_size, padding_idx=word_padding_idx)
        emb = PartUpdateEmbedding(num_special, emb_special, emb_word)
        return emb
    else:
        return emb_word
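# PartUpdateEmbedding is also project-specific. From the code above it appears to pair a
# small trainable table (emb_special) with a frozen pretrained table (emb_word) for the
# first num_special indices; a rough, assumed sketch of that idea:
import torch.nn as nn

class PartUpdateEmbeddingSketch(nn.Module):
    def __init__(self, num_special, emb_special, emb_word):
        super().__init__()
        self.num_special = num_special
        self.emb_special = emb_special        # trainable rows for special tokens
        self.emb_word = emb_word              # pretrained rows, kept frozen
        self.emb_word.weight.requires_grad = False

    def forward(self, idx):
        fixed = self.emb_word(idx)
        special = self.emb_special(idx.clamp(max=self.num_special - 1))
        mask = (idx < self.num_special).unsqueeze(-1).type_as(fixed)
        return mask * special + (1 - mask) * fixed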
elif name in ['imdb', 'IMDB']:
    embedding_dim = 100
    max_total_num_words = 20000
    text = data.Field(tokenize=tokenizer_twolevel,
                      batch_first=True)
    label = data.Field(lower=True)
    label_pred = data.Field(use_vocab=False, fix_length=1)
    fname = data.Field(use_vocab=False, fix_length=1)
    train, valid, test = IMDB_modified.splits(text, label, label_pred, fname,
                                              root=root, model_name=args.model_name,
                                              load_pred=args.load_pred)
    print("build vocab...")
    text.build_vocab(train, vectors=GloVe(name='6B',
                                          dim=embedding_dim,
                                          cache=root), max_size=max_total_num_words)
    label.build_vocab(train)
    print("Create Iterator objects for multiple splits of a dataset...")
    train_loader, valid_loader, test_loader = data.Iterator.splits((train, valid, test),
                                                                   batch_size=batch_size,
                                                                   device=device,
                                                                   repeat=False)
    data_loader['word_idx'] = text.vocab.itos
    data_loader['x_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor
    data_loader['y_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor
    data_loader['max_total_num_words'] = max_total_num_words
    data_loader['embedding_dim'] = embedding_dim
    data_loader['max_num_words'] = 50
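# A hedged usage sketch (not part of the original loader): the entries stashed in
# data_loader, together with the GloVe rows that build_vocab attached to text.vocab,
# are enough to seed the model's word embedding.
import torch.nn as nn
emb = nn.Embedding(len(data_loader['word_idx']), data_loader['embedding_dim'])
emb.weight.data.copy_(text.vocab.vectors)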
self.seq_length = opt.seq_length
self.split = split
self.seq_per_img = seq_per_img
# image processing function.
if split == 'train':
    self.Resize = transforms.Resize((self.opt.image_size, self.opt.image_size))
else:
    self.Resize = transforms.Resize((self.opt.image_crop_size, self.opt.image_crop_size))
self.RandomCropWithBbox = utils.RandomCropWithBbox(opt.image_crop_size)
self.ToTensor = transforms.ToTensor()
self.res_Normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
self.vgg_pixel_mean = np.array([[[102.9801, 115.9465, 122.7717]]])
self.max_gt_box = 100
self.max_proposal = 200
self.glove = vocab.GloVe(name='6B', dim=300)
# load the json file which contains additional information about the dataset
print('DataLoader loading json file: ', opt.input_dic)
self.info = json.load(open(self.opt.input_dic))
self.itow = self.info['ix_to_word']
self.wtoi = {w:i for i,w in self.itow.items()}
self.wtod = {w:i+1 for w,i in self.info['wtod'].items()} # word to detection
self.dtoi = self.wtod # detection to index
self.itod = {i:w for w,i in self.dtoi.items()}
self.wtol = self.info['wtol']
self.ltow = {l:w for w,l in self.wtol.items()}
self.vocab_size = len(self.itow) + 1 # since it starts from 1
print('vocab size is ', self.vocab_size)
self.itoc = self.itod
# initialize the fg+s/p map back to word idx.
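# The original code continues past this point; as an illustration of how self.glove is
# typically queried later on (assumed, not the project's exact code), a single word maps
# to its 300-d row via stoi/vectors, with a zero vector for out-of-vocabulary words:
import torch

def glove_vec(glove, word):
    idx = glove.stoi.get(word)
    return glove.vectors[idx] if idx is not None else torch.zeros(glove.dim)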
# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())
# make iterator for splits
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=3, device="cuda:0")
# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)
# Approach 2:
TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram(), FastText()])
LABEL.build_vocab(train)
# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())
train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4)
# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)
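# A hedged note on the multi-vector build_vocab above: legacy torchtext concatenates the
# supplied vector tables per token along the feature dimension, so each row of
# TEXT.vocab.vectors should be 300 (GloVe 840B) + 100 (CharNGram) + 300 (FastText) = 700
# dims wide. A quick sanity check (assuming the vocab was built as shown in Approach 2):
assert TEXT.vocab.vectors.size(1) == 300 + 100 + 300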
# Approach 3:
f = FastText()
TEXT.build_vocab(train, vectors=f)
self.seq_length = opt.seq_length
self.split = split
self.seq_per_img = seq_per_img
self.att_feat_size = opt.att_feat_size
self.vis_attn = opt.vis_attn
self.feature_root = opt.feature_root
self.seg_feature_root = opt.seg_feature_root
self.num_sampled_frm = opt.num_sampled_frm
self.num_prop_per_frm = opt.num_prop_per_frm
self.exclude_bgd_det = opt.exclude_bgd_det
self.prop_thresh = opt.prop_thresh
self.t_attn_size = opt.t_attn_size
self.test_mode = opt.test_mode
self.max_gt_box = 100
self.max_proposal = self.num_sampled_frm * self.num_prop_per_frm
self.glove = vocab.GloVe(name='6B', dim=300)
# load the json file which contains additional information about the dataset
print('DataLoader loading json file: ', opt.input_dic)
self.info = json.load(open(self.opt.input_dic))
self.itow = self.info['ix_to_word']
self.wtoi = {w:i for i,w in self.itow.items()}
self.wtod = {w:i+1 for w,i in self.info['wtod'].items()} # word to detection
self.dtoi = self.wtod # detection to index
self.itod = {i:w for w,i in self.dtoi.items()}
self.wtol = self.info['wtol']
self.ltow = {l:w for w,l in self.wtol.items()}
self.vocab_size = len(self.itow) + 1 # since it starts from 1
print('vocab size is ', self.vocab_size)
self.itoc = self.itod
# get the glove vector for the vg detection cls
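# The original loader goes on to look up GloVe rows for the detection classes announced
# in the comment above; a hedged sketch of that step (assumed, not the project's exact
# code) averages the 300-d rows of each word in a possibly multi-word class name:
import torch

def glove_for_class(glove, cls_name):
    rows = [glove.vectors[glove.stoi[w]] for w in cls_name.split() if w in glove.stoi]
    return torch.stack(rows).mean(0) if rows else torch.zeros(glove.dim)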
dataset_fn = lambda name: data.TabularDataset(
path=root + name,
format='tsv',
fields=[('text', TEXT)]
)
train_pos_set, train_neg_set = map(dataset_fn, [train_pos, train_neg])
dev_pos_set, dev_neg_set = map(dataset_fn, [dev_pos, dev_neg])
test_pos_set, test_neg_set = map(dataset_fn, [test_pos, test_neg])
TEXT.build_vocab(train_pos_set, train_neg_set, min_freq=config.min_freq)
if config.load_pretrained_embed:
    start = time.time()
    vectors = torchtext.vocab.GloVe('6B', dim=config.embed_size, cache=config.pretrained_embed_path)
    TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
    print('vectors', TEXT.vocab.vectors.size())
    print('load embedding took {:.2f} s.'.format(time.time() - start))
vocab = TEXT.vocab
dataiter_fn = lambda dataset, train: data.BucketIterator(
dataset=dataset,
batch_size=config.batch_size,
shuffle=train,
repeat=train,
sort_key=lambda x: len(x.text),
sort_within_batch=False,
device=config.device
)
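# A hedged usage sketch: dataiter_fn is presumably applied to each split next; the
# variable names below are assumptions, not taken from the original code.
train_pos_iter = dataiter_fn(train_pos_set, True)
train_neg_iter = dataiter_fn(train_neg_set, True)
dev_pos_iter = dataiter_fn(dev_pos_set, False)
test_pos_iter = dataiter_fn(test_pos_set, False)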
# set up preinitialized embeddings
try:
    import torchtext.vocab as vocab
except ImportError as ex:
    print('Please install torchtext with `pip install torchtext`')
    raise ex
pretrained_dim = 300
if emb_type.startswith('glove'):
    if 'twitter' in emb_type:
        init = 'glove-twitter'
        name = 'twitter.27B'
        pretrained_dim = 200
    else:
        init = 'glove'
        name = '840B'
    embs = vocab.GloVe(
        name=name,
        dim=pretrained_dim,
        cache=modelzoo_path(self.opt.get('datapath'), 'zoo:glove_vectors'),
    )
elif emb_type.startswith('fasttext_cc'):
    init = 'fasttext_cc'
    from parlai.zoo.fasttext_cc_vectors.build import download
    embs = download(self.opt.get('datapath'))
elif emb_type.startswith('fasttext'):
    init = 'fasttext'
    from parlai.zoo.fasttext_vectors.build import download
    embs = download(self.opt.get('datapath'))
else:
    raise RuntimeError('embedding type {} not implemented'.format(emb_type))
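# A hedged sketch of what typically happens with `embs` next (not the surrounding
# project's code): copy each pretrained row into the model's embedding for every
# dictionary token the vector file covers. `model_dict` and `embedding` are hypothetical.
for i, token in enumerate(model_dict):
    if token in embs.stoi:
        embedding.weight.data[i] = embs.vectors[embs.stoi[token]]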