How to use the gensim.models.keyedvectors.KeyedVectors.load_word2vec_format function in gensim

To help you get started, we’ve selected a few gensim examples based on popular ways it is used in public projects.
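At its core, load_word2vec_format reads pretrained vectors in the classic word2vec file format (text or binary) and returns a KeyedVectors instance you can query directly. A minimal sketch, assuming the gensim 3.x API and a locally downloaded GoogleNews vector file:

from gensim.models.keyedvectors import KeyedVectors

# binary=True for .bin files, binary=False for plain-text vectors
vectors = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)

print(vectors['king'].shape)                   # (300,)
print(vectors.most_similar('king', topn=3))    # nearest neighbors by cosine similarity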


github sjyk / alphaclean / alphaclean / search.py
def loadWord2Vec(filename):
    """Loads a word2vec model from a file"""
    from gensim.models.keyedvectors import KeyedVectors
    return KeyedVectors.load_word2vec_format(filename, binary=True)
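Word2vec-format files are conventionally sorted by descending word frequency, so loading only a prefix keeps the most common words. For large files, load_word2vec_format accepts a limit argument that reads just the first N vectors and caps memory use; a short sketch:

from gensim.models.keyedvectors import KeyedVectors

# read only the 200,000 most frequent vectors from the binary file
vectors = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True, limit=200000)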
github radiodee1 / awesome-chatbot / model / babi_iii.py
def task_set_embedding_matrix(self):
    # os, hparams and self.output_lang come from the surrounding module/class
    print('stage: set_embedding_matrix')
    glove_data = hparams['data_dir'] + hparams['embed_name']
    from gensim.models.keyedvectors import KeyedVectors
    import numpy as np

    embed_size = int(hparams['embed_size'])

    embeddings_index = {}
    if not os.path.isfile(glove_data):
        # no pretrained file on disk; fall back to a trainable embedding
        self.embedding_matrix = None
    else:
        # load pretrained vectors in word2vec text format
        glove_model = KeyedVectors.load_word2vec_format(glove_data, binary=False)

        for idx in range(self.output_lang.n_words):
            word = self.output_lang.index2word[idx]
            if word in glove_model.vocab:
                # copy the pretrained vector for in-vocabulary words
                embeddings_index[word] = np.asarray(glove_model[word], dtype='float32')
            else:
                # out-of-vocabulary word: random uniform initialization
                print('fill with random values', idx, word)
                embeddings_index[word] = np.random.uniform(
                    low=self.uniform_low, high=self.uniform_high, size=(embed_size,))
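The membership test above targets the gensim 3.x API, where KeyedVectors exposes a vocab dict. Under gensim 4.x that dict was replaced by key_to_index, so the equivalent check and lookup become roughly:

# gensim 4.x equivalent of the membership test and lookup above
if word in glove_model.key_to_index:
    value = np.asarray(glove_model[word], dtype='float32')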
github hoxmark / Deep_reinforcement_active_learning / models / rnn.py
def load_word2vec(self):
        # assumes module-level imports: from gensim.models import KeyedVectors; import numpy as np
        print("loading word2vec...")
        word_vectors = KeyedVectors.load_word2vec_format(
            "GoogleNews-vectors-negative300.bin", binary=True)

        wv_matrix = []
        for word in self.data["vocab"]:
            if word in word_vectors.vocab:
                wv_matrix.append(word_vectors.word_vec(word))
            else:
                wv_matrix.append(
                    np.random.uniform(-0.01, 0.01, 300).astype("float32"))

        # one for UNK and one for zero padding
        wv_matrix.append(np.random.uniform(-0.01, 0.01, 300).astype("float32"))
        wv_matrix.append(np.zeros(300).astype("float32"))
        wv_matrix = np.array(wv_matrix)
        return wv_matrix
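The returned matrix is typically copied into an embedding layer. A sketch of that hand-off, assuming a PyTorch model downstream (the wiring here is illustrative, not taken from the repository):

import torch
import torch.nn as nn

# hypothetical hand-off: rows of wv_matrix become the layer weights;
# the final all-zero row appended above doubles as the padding vector
embedding = nn.Embedding.from_pretrained(torch.from_numpy(wv_matrix),
                                         freeze=False)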
github glnmario / emo2vec / lp_crossval.py
def build_T(model_path, sigma):
    """
    :param model_path: the path of the final model
    :param sigma: bandwidth of the Gaussian kernel
    :return: (T matrix, word-to-index dictionary for T)
    """
    emo2vec = KeyedVectors.load_word2vec_format(model_path, binary=False)
    idx2word = dict(enumerate(emo2vec.index2word))
    n = 100  # restrict to the first n words; use len(idx2word) for the full vocabulary

    # invert the idx -> word mapping for the first n words
    word2idx = {idx2word[i]: i for i in range(n)}

    t = np.empty((n, n), dtype='float16')

    # squared cosine distance between every pair of words
    for w1, i in word2idx.items():
        for w2, j in word2idx.items():
            t[i, j] = to_cosine_dist(emo2vec.similarity(w1, w2)) ** 2

    # Gaussian kernel, then L1-normalize each column into a transition matrix
    t /= sigma ** 2
    t = np.exp(-t)
    t = normalize(t, axis=0, norm='l1', copy=False)

    return t, word2idx
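The function relies on module-level imports and a project-local distance helper that the snippet doesn't show. A plausible sketch of that surrounding context, where the body of to_cosine_dist is a hypothetical stand-in (its real definition lives elsewhere in the emo2vec repository):

import numpy as np
from sklearn.preprocessing import normalize
from gensim.models.keyedvectors import KeyedVectors

def to_cosine_dist(sim):
    # hypothetical: map a cosine similarity in [-1, 1] to a distance in [0, 1]
    return (1.0 - sim) / 2.0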
github hoxmark / Deep_reinforcement_active_learning / reinforcement / utils.py
def load_word2vec():
    # assumes module-level imports: from gensim.models import KeyedVectors; import numpy as np
    print("loading word2vec...")
    word_vectors = KeyedVectors.load_word2vec_format(
        "{}/GoogleNews-vectors-negative300.bin".format(opt.data_path), binary=True)
    wv_matrix = []

    if opt.dataset == 'vse':
        for idx in sorted(opt.vocab.idx2word.keys()):
            word = opt.vocab.idx2word[idx]
            if word in word_vectors.vocab:
                wv_matrix.append(word_vectors.word_vec(word))
            else:
                # out-of-vocabulary word: small random initialization
                wv_matrix.append(
                    np.random.uniform(-0.01, 0.01, 300).astype("float32"))

    else:
        for word in data.vocab:
            if word in word_vectors.vocab:
                wv_matrix.append(word_vectors.word_vec(word))
            else:
                # out-of-vocabulary word: small random initialization
                wv_matrix.append(
                    np.random.uniform(-0.01, 0.01, 300).astype("float32"))
github castorini / hedwig / sm_cnn / utils.py
def cache_word_embeddings(word_embeddings_file, cache_file):
    if not word_embeddings_file.endswith('.gz'):
        logger.warning('WARNING: expecting a .gz file. '
                       'Is {} in the correct format?'.format(word_embeddings_file))

    vocab_size, vec_dim = 0, 0  # remain 0 when the cache already exists

    if not os.path.exists(cache_file):
        # cache does not exist
        if not os.path.exists(os.path.dirname(cache_file)):
            # make cache folder if needed
            os.mkdir(os.path.dirname(cache_file))
        logger.info('caching the word embeddings in np.memmap format')
        wv = KeyedVectors.load_word2vec_format(word_embeddings_file, binary=True)
        # memory-map the raw vectors so later runs can page them in lazily
        fp = np.memmap(cache_file, dtype=np.double, mode='w+', shape=wv.syn0.shape)
        fp[:] = wv.syn0[:]
        with open(cache_file + '.vocab', 'w', encoding='utf-8') as f:
            logger.info('writing out vocab for {}'.format(word_embeddings_file))
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                print(w, file=f)
        with open(cache_file + '.dimensions', 'w', encoding='utf-8') as f:
            logger.info('writing out dimensions for {}'.format(word_embeddings_file))
            print(wv.syn0.shape[0], wv.syn0.shape[1], file=f)
        vocab_size, vec_dim = wv.syn0.shape
        del fp, wv
        print('cached {} into {}'.format(word_embeddings_file, cache_file))

    return vocab_size, vec_dim
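Reading the cache back is the mirror image of what the function writes: the .dimensions file carries the memmap shape and the .vocab file holds one word per line. A minimal sketch built only on those conventions:

import numpy as np

def load_cached_embeddings(cache_file):
    # recover the shape written out by cache_word_embeddings
    with open(cache_file + '.dimensions') as f:
        vocab_size, vec_dim = map(int, f.read().split())
    # open the memmap read-only; one row per vocabulary word
    vectors = np.memmap(cache_file, dtype=np.double, mode='r',
                        shape=(vocab_size, vec_dim))
    with open(cache_file + '.vocab', encoding='utf-8') as f:
        vocab = [line.rstrip('\n') for line in f]
    return vocab, vectors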
github Hironsan / anago / anago / models / keras_gensim_embeddings.py
def convert_embeddings(glove_input_file, word2vec_output_file,
                       embeddings_path='embeddings.npz',
                       vocab_path='map.json'):
    """
    Convert GloVe vectors to word2vec format, then save the weight
    matrix and the word-index map to disk.
    :param glove_input_file: GloVe vectors in text format
    :param word2vec_output_file: where to write the word2vec-format copy
    :param embeddings_path: where to save the embeddings
    :param vocab_path: where to save the word-index map
    """
    glove2word2vec(glove_input_file, word2vec_output_file)
    model = KeyedVectors.load_word2vec_format(word2vec_output_file)
    weights = model.syn0
    np.save(open(embeddings_path, 'wb'), weights)  # np.save writes .npy data

    vocab = {word: voc.index for word, voc in model.vocab.items()}
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)
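The function assumes gensim's glove2word2vec script plus numpy and json at module level; a minimal end-to-end sketch with hypothetical file names:

import json
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# hypothetical input/output paths for illustration
convert_embeddings('glove.6B.100d.txt', 'glove.6B.100d.w2v.txt',
                   embeddings_path='embeddings.npy',
                   vocab_path='map.json')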