How to use the torchtext.vocab.GloVe class in torchtext

To help you get started, we've selected a few torchtext.vocab.GloVe examples, based on popular ways it is used in public projects.
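
GloVe(name=..., dim=...) downloads the requested pretrained vectors on first use, caches them locally (by default under .vector_cache), and exposes them through stoi, itos, and vectors. A minimal standalone sketch before diving into the project examples:

import torch
from torchtext.vocab import GloVe

# downloads (first run only) and caches the 100-d "6B" vectors;
# '.vector_cache' is torchtext's default cache directory
glove = GloVe(name='6B', dim=100)

vec = glove['language']            # 1-D FloatTensor of length 100
row = glove.stoi['language']       # stoi maps token -> row in glove.vectors
assert torch.equal(glove.vectors[row], vec)
print(vec.shape)                   # torch.Size([100])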

github pytorch / text / test / language_modeling.py
from torchtext import data, datasets
from torchtext.vocab import GloVe

# Approach 1:
# set up fields
TEXT = data.Field(lower=True, batch_first=True)

# make splits for data
train, valid, test = datasets.WikiText2.splits(TEXT)

# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0])['text'][0:10])

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))

# make iterator for splits
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test), batch_size=3, bptt_len=30, device="cuda:0")

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.target)

# Approach 2:
train_iter, valid_iter, test_iter = datasets.WikiText2.iters(batch_size=4, bptt_len=30)
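
Once build_vocab has attached the GloVe vectors, TEXT.vocab.vectors is a (vocab_size, dim) tensor whose rows line up with the vocab indices, so it can seed a model's embedding layer directly. A short sketch (freeze=False is a modeling choice, not something the example above prescribes):

import torch.nn as nn

# rows of TEXT.vocab.vectors line up with TEXT.vocab.stoi, so the matrix
# can seed an embedding layer directly (freeze=False makes it fine-tunable)
embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)
embedded = embedding(batch.text)  # (batch_size, bptt_len, 300)
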
github pytorch / text / test / imdb.py
from torchtext import data, datasets
from torchtext.vocab import GloVe

# Approach 1:
# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)


# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_size=3, device="cuda:0")

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 2: let the dataset build vocab and iterators in one call,
# mirroring the WikiText2 example above
train_iter, test_iter = datasets.IMDB.iters(batch_size=4)
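
Because TEXT was built with include_lengths=True, batch.text in Approach 1 is a (padded_tensor, lengths) pair, which makes it possible to pack the padded batch before an RNN. A hedged sketch (the LSTM and its sizes are illustrative, not part of the original example):

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

text, lengths = batch.text                   # include_lengths=True yields a pair
embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors)
lstm = nn.LSTM(300, 128, batch_first=True)   # illustrative sizes

packed = pack_padded_sequence(
    embedding(text), lengths.cpu(), batch_first=True, enforce_sorted=False)
output, (h_n, c_n) = lstm(packed)
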
github arthurmensch / didyprog / examples / ner / word_tagging.py
init_idx = None

if isinstance(pretrained_embeddings, int):
    sentences.build_vocab(train_data, val_data, test_data)
    embedding_dim = pretrained_embeddings
else:
    if pretrained_embeddings == 'ner':
        vectors = CaseInsensitiveVectors(
            expanduser('~/data/sdtw_data/ner/%s' %
                       tagger_languages[language]),
            unk_init=lambda x: x.normal_(0, 1),
            cache=expanduser('~/cache'))
    elif 'glove' in pretrained_embeddings:
        _, name, dim = pretrained_embeddings.split('.')
        dim = dim[:-1]
        GloVe.__getitem__ = CaseInsensitiveVectors.__getitem__
        vectors = GloVe(name=name, dim=dim, cache=expanduser('~/cache'))
    elif pretrained_embeddings == 'fasttext':
        FastText.__getitem__ = CaseInsensitiveVectors.__getitem__
        FastText.cache = CaseInsensitiveVectors.cache
        vectors = FastText(language=language,
                           cache=expanduser('~/cache'))
    # extend the vocab with test/val words that have embeddings in the
    # pre-trained vectors; a production version would do this dynamically
    # at inference time
    counter = Counter()
    sentences.build_vocab(val_data, test_data)
    for word in sentences.vocab.stoi:
        if word in vectors.stoi or word.lower() in vectors.stoi or \
                re.sub(r'\d', '0', word.lower()) in vectors.stoi:
            counter[word] = 1
    eval_vocab = Vocab(counter)
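
The monkey-patching above (swapping CaseInsensitiveVectors.__getitem__ into the GloVe class) works but mutates GloVe globally. A gentler alternative, sketched here under the assumption that only a lowercase fallback is wanted, is a small subclass; the final branch mirrors torchtext's own unk_init fallback for out-of-vocabulary tokens:

import torch
from torchtext.vocab import GloVe

class CaseInsensitiveGloVe(GloVe):
    """Sketch: fall back to the lowercased token before giving up."""
    def __getitem__(self, token):
        if token in self.stoi:
            return self.vectors[self.stoi[token]]
        if token.lower() in self.stoi:
            return self.vectors[self.stoi[token.lower()]]
        return self.unk_init(torch.Tensor(self.dim))
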
github prezaei85 / nl2sql / src / table / ModelConstructor.py
def make_word_embeddings(opt, word_dict, fields):
    word_padding_idx = word_dict.stoi[table.IO.PAD_WORD]
    num_word = len(word_dict)
    emb_word = nn.Embedding(num_word, opt.word_vec_size,
                            padding_idx=word_padding_idx)
    if len(opt.pre_word_vecs) > 0:
        vectors = torchtext.vocab.GloVe(
            name="840B", cache=opt.pre_word_vecs, dim=str(opt.word_vec_size))
        fields["src"].vocab.load_vectors(vectors)
        emb_word.weight.data.copy_(fields["src"].vocab.vectors)

    if opt.fix_word_vecs:
        # <unk> is 0
        num_special = len(table.IO.special_token_list)
        # zero vectors in the fixed embedding (emb_word)
        emb_word.weight.data[:num_special].zero_()
        emb_special = nn.Embedding(
            num_special, opt.word_vec_size, padding_idx=word_padding_idx)
        emb = PartUpdateEmbedding(num_special, emb_special, emb_word)
        return emb
    else:
        return emb_word
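
PartUpdateEmbedding here is a project-specific module; the underlying trick is to keep the first num_special rows trainable while the pretrained tail stays frozen. A generic sketch of that split (class and argument names are illustrative, not the project's API):

import torch
import torch.nn as nn

class PartiallyFrozenEmbedding(nn.Module):
    """Sketch: rows < num_special train, the pretrained rest stays frozen."""
    def __init__(self, num_special, num_words, dim):
        super().__init__()
        self.num_special = num_special
        self.special = nn.Embedding(num_special, dim)      # trainable rows
        self.pretrained = nn.Embedding(num_words, dim)     # frozen rows
        self.pretrained.weight.requires_grad = False

    def forward(self, idx):
        frozen = self.pretrained(idx)
        special = self.special(idx.clamp(max=self.num_special - 1))
        mask = (idx < self.num_special).unsqueeze(-1)
        # pick the trainable row for special tokens, the frozen one otherwise
        return torch.where(mask, special, frozen)

Copying the GloVe matrix into pretrained.weight.data before freezing, as the copy_ call above does, completes the pattern.
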
github SeojinBang / VIBI / return_data.py
elif name in ['imdb', 'IMDB']:

    embedding_dim = 100
    max_total_num_words = 20000
    text = data.Field(tokenize=tokenizer_twolevel,
                      batch_first=True)
    label = data.Field(lower=True)
    label_pred = data.Field(use_vocab=False, fix_length=1)
    fname = data.Field(use_vocab=False, fix_length=1)

    train, valid, test = IMDB_modified.splits(text, label, label_pred, fname,
                                              root=root, model_name=args.model_name,
                                              load_pred=args.load_pred)
    print("build vocab...")
    text.build_vocab(train, vectors=GloVe(name='6B',
                                          dim=embedding_dim,
                                          cache=root), max_size=max_total_num_words)
    label.build_vocab(train)

    print("Create Iterator objects for multiple splits of a dataset...")
    train_loader, valid_loader, test_loader = data.Iterator.splits((train, valid, test),
                                                                   batch_size=batch_size,
                                                                   device=device,
                                                                   repeat=False)

    data_loader['word_idx'] = text.vocab.itos
    data_loader['x_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor
    data_loader['y_type'] = torch.cuda.LongTensor if args.cuda else torch.LongTensor
    data_loader['max_total_num_words'] = max_total_num_words
    data_loader['embedding_dim'] = embedding_dim
    data_loader['max_num_words'] = 50
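
With max_size capping the vocab at 20,000 words, some kept tokens may still be missing from the 6B vectors; torchtext leaves their rows zero-filled by default. A quick, hedged way to check coverage after build_vocab:

# a row that is still all zeros was not found in the pretrained vocabulary
# (zero-filling is torchtext's default unk_init)
missing = int((text.vocab.vectors.abs().sum(dim=1) == 0).sum())
print('tokens without a pretrained vector: %d / %d' % (missing, len(text.vocab)))
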
github luulinh90s / Neural-Baby-Talk-python3 / misc / dataloader_flickr30k.py
self.seq_length = opt.seq_length
self.split = split
self.seq_per_img = seq_per_img
# image processing function.
if split == 'train':
    self.Resize = transforms.Resize((self.opt.image_size, self.opt.image_size))
else:
    self.Resize = transforms.Resize((self.opt.image_crop_size, self.opt.image_crop_size))
self.RandomCropWithBbox = utils.RandomCropWithBbox(opt.image_crop_size)
self.ToTensor = transforms.ToTensor()
self.res_Normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
self.vgg_pixel_mean = np.array([[[102.9801, 115.9465, 122.7717]]])

self.max_gt_box = 100
self.max_proposal = 200
self.glove = vocab.GloVe(name='6B', dim=300)

# load the json file which contains additional information about the dataset
print('DataLoader loading json file: ', opt.input_dic)
self.info = json.load(open(self.opt.input_dic))
self.itow = self.info['ix_to_word']
self.wtoi = {w: i for i, w in self.itow.items()}
self.wtod = {w: i + 1 for w, i in self.info['wtod'].items()}  # word to detection
self.dtoi = self.wtod  # detection to index
self.itod = {i: w for w, i in self.dtoi.items()}
self.wtol = self.info['wtol']
self.ltow = {l: w for w, l in self.wtol.items()}
self.vocab_size = len(self.itow) + 1  # since indices start from 1
print('vocab size is ', self.vocab_size)
self.itoc = self.itod

# initialize the fg+s/p map back to word idx.
github wabyking / TextClassificationBenchmark / dataloader / torch_text_demo / sst.py
from torchtext import data, datasets
from torchtext.vocab import GloVe, CharNGram, FastText

# (TEXT, LABEL and the SST splits train/val/test are assumed to be set up
# as in the IMDB example above)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=3, device="cuda:0")

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 2:
TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram(), FastText()])
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

train_iter, val_iter, test_iter = datasets.SST.iters(batch_size=4)

# print batch information
batch = next(iter(train_iter))
print(batch.text)
print(batch.label)

# Approach 3:
f = FastText()
TEXT.build_vocab(train, vectors=f)
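
When build_vocab receives a list of vector sets, as in Approach 2 above, torchtext concatenates them per token, so the combined dimensionality is the sum of the parts; assuming the standard sizes (GloVe 840B is 300-d, CharNGram 100-d, the English FastText wiki vectors 300-d), that is 700. A hedged sketch, re-running Approach 2's build to verify:

# rebuild as in Approach 2; each row becomes [GloVe(300) | CharNGram(100) | FastText(300)]
TEXT.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram(), FastText()])
assert TEXT.vocab.vectors.size(1) == 300 + 100 + 300  # 700-dimensional rows
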
github facebookresearch / grounded-video-description / misc / dataloader_anet.py
self.seq_length = opt.seq_length
self.split = split
self.seq_per_img = seq_per_img
self.att_feat_size = opt.att_feat_size
self.vis_attn = opt.vis_attn
self.feature_root = opt.feature_root
self.seg_feature_root = opt.seg_feature_root
self.num_sampled_frm = opt.num_sampled_frm
self.num_prop_per_frm = opt.num_prop_per_frm
self.exclude_bgd_det = opt.exclude_bgd_det
self.prop_thresh = opt.prop_thresh
self.t_attn_size = opt.t_attn_size
self.test_mode = opt.test_mode
self.max_gt_box = 100
self.max_proposal = self.num_sampled_frm * self.num_prop_per_frm
self.glove = vocab.GloVe(name='6B', dim=300)

# load the json file which contains additional information about the dataset
print('DataLoader loading json file: ', opt.input_dic)
self.info = json.load(open(self.opt.input_dic))
self.itow = self.info['ix_to_word']
self.wtoi = {w: i for i, w in self.itow.items()}
self.wtod = {w: i + 1 for w, i in self.info['wtod'].items()}  # word to detection
self.dtoi = self.wtod  # detection to index
self.itod = {i: w for w, i in self.dtoi.items()}
self.wtol = self.info['wtol']
self.ltow = {l: w for w, l in self.wtol.items()}
self.vocab_size = len(self.itow) + 1  # since indices start from 1
print('vocab size is ', self.vocab_size)
self.itoc = self.itod

# get the glove vector for the vg detection cls
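
The trailing comment points at the next step: embedding each Visual Genome detection class with GloVe. Class names can be multi-word ('traffic light'), so a common recipe, offered here only as an assumption about what the repository does next, is to average the per-word vectors:

import torch

def class_name_to_glove(glove, name):
    # average over a (possibly multi-word) class name such as 'traffic light';
    # view(-1) guards against unk rows, which some torchtext versions
    # return with shape (1, dim)
    vecs = [glove[w].view(-1) for w in name.split()]
    return torch.stack(vecs).mean(dim=0)
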
github plkmo / NLP_Toolkit / nlptoolkit / style_transfer / data.py
dataset_fn = lambda name: data.TabularDataset(
    path=root + name,
    format='tsv',
    fields=[('text', TEXT)]
)

train_pos_set, train_neg_set = map(dataset_fn, [train_pos, train_neg])
dev_pos_set, dev_neg_set = map(dataset_fn, [dev_pos, dev_neg])
test_pos_set, test_neg_set = map(dataset_fn, [test_pos, test_neg])

TEXT.build_vocab(train_pos_set, train_neg_set, min_freq=config.min_freq)

if config.load_pretrained_embed:
    start = time.time()

    vectors = torchtext.vocab.GloVe('6B', dim=config.embed_size, cache=config.pretrained_embed_path)
    TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
    print('vectors', TEXT.vocab.vectors.size())

    print('load embedding took {:.2f} s.'.format(time.time() - start))

vocab = TEXT.vocab

dataiter_fn = lambda dataset, train: data.BucketIterator(
    dataset=dataset,
    batch_size=config.batch_size,
    shuffle=train,
    repeat=train,
    sort_key=lambda x: len(x.text),
    sort_within_batch=False,
    device=config.device
)
github facebookresearch / ParlAI / parlai / core / torch_agent.py
# set up preinitialized embeddings
try:
    import torchtext.vocab as vocab
except ImportError as ex:
    print('Please install torchtext with `pip install torchtext`')
    raise ex
pretrained_dim = 300
if emb_type.startswith('glove'):
    if 'twitter' in emb_type:
        init = 'glove-twitter'
        name = 'twitter.27B'
        pretrained_dim = 200
    else:
        init = 'glove'
        name = '840B'
    embs = vocab.GloVe(
        name=name,
        dim=pretrained_dim,
        cache=modelzoo_path(self.opt.get('datapath'), 'zoo:glove_vectors'),
    )
elif emb_type.startswith('fasttext_cc'):
    init = 'fasttext_cc'
    from parlai.zoo.fasttext_cc_vectors.build import download

    embs = download(self.opt.get('datapath'))
elif emb_type.startswith('fasttext'):
    init = 'fasttext'
    from parlai.zoo.fasttext_vectors.build import download

    embs = download(self.opt.get('datapath'))
else:
    raise RuntimeError(