How to use the gensim.models module in gensim

To help you get started, we’ve selected a few gensim.models examples, based on popular ways the module is used in public projects.
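For orientation, here is a minimal, hedged quick-start for the module's most common entry point, gensim.models.Word2Vec. The toy corpus is invented, and the parameters follow the gensim 3.x-style API used throughout the snippets below:

import gensim

sentences = [["hello", "world"], ["machine", "learning", "with", "gensim"]]
model = gensim.models.Word2Vec(sentences, size=50, window=5, min_count=1, workers=2)
print(model.wv.most_similar("hello", topn=1))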


github fredriko / metacurate-lexicon / src / scripts / train_wordspace_models.py
import os
import gensim

def train_word2vec_model(input: str, output_directory: str, model_name: str) -> None:

    if not os.access(output_directory, os.W_OK):
        print("Cannot write to directory {}. Exiting!".format(output_directory))
        exit(1)

    if os.path.isdir(input):
        sentences = gensim.models.word2vec.PathLineSentences(input)
    else:
        sentences = gensim.models.word2vec.LineSentence(input)

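    # Note: passing `sentences` to the constructor already trains the model
    # (5 epochs by default); the explicit train() call below adds 10 more epochs.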
    model = gensim.models.Word2Vec(sentences, sg=0, size=100, window=10, min_count=20, workers=10)
    model.train(sentences, total_examples=model.corpus_count, epochs=10)
    model.save(output_directory + model_name)
    # We want the vectors only to reduce memory footprint: this is the file(s) that the online lexicon should use.
    vectors = model.wv
    vectors.save(output_directory + model_name + ".vectors-only")
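A hedged follow-up to the snippet above (the path and query word are hypothetical): the slimmed-down KeyedVectors file can be reloaded later without the full model, optionally memory-mapped:

import gensim

vectors = gensim.models.KeyedVectors.load("out/word2vec.model.vectors-only", mmap="r")
print(vectors.most_similar("lexicon", topn=5))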
github ziqizhang / chase / python / src / ml / classifier_dnn.py
    else:
        params["scoreperclass"] = True
    if "word_norm" not in params.keys():
        params["word_norm"] = 1
    if "oov_random" not in params.keys():
        params["oov_random"] = 0
    if "emb_model" in params.keys():
        emb_models = []
        print("===> use pre-trained embeddings...")
        model_str = params["emb_model"].split(',')
        for m_s in model_str:
            gensimFormat = ".gensim" in m_s
            if gensimFormat:
                emb_models.append(gensim.models.KeyedVectors.load(m_s, mmap='r'))
            else:
                emb_models.append(gensim.models.KeyedVectors. \
                                  load_word2vec_format(m_s, binary=True))
        print("<===loaded {} models".format(len(emb_models)))
    if "emb_dim" in params.keys():
        emb_dim = int(params["emb_dim"])
    if "gpu" in params.keys():
        if params["gpu"] == "1":
            print("using gpu...")
        else:
            print("using cpu...")
    if "wdist" in params.keys():
        wdist_file = params["wdist"]
    else:
        wdist_file = None


use_mixed_data = False
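The loading logic above can be exercised on its own; a minimal sketch (the path is hypothetical) that picks the loader from the file name, as the snippet does:

import gensim

path = "embeddings/wiki.gensim"  # hypothetical embedding file
if ".gensim" in path:
    kv = gensim.models.KeyedVectors.load(path, mmap='r')
else:
    kv = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
print(kv.vector_size)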
github svakulenk0 / semantic_coherence / load_embeddings.py
import numpy as np
import gensim

def load_embeddings_gensim(embeddings_config, label, vocabulary, save_to):
    # create a weight matrix for entities in training docs
    embedding_matrix = np.zeros((len(vocabulary), embeddings_config['dims']))
        
    # load embeddings binary model with gensim for word2vec and rdf2vec embeddings
    model = gensim.models.Word2Vec.load(embeddings_config['path'])
    #model = gensim.models.KeyedVectors.load_word2vec_format(embeddings_config['path'], binary=True)
    embedded_entities = model.wv
    missing = 0
    for entity, entity_id in vocabulary.items():
        # strip entity label format to rdf2vec label format
        #rdf2vec_entity_label = 'dbr:%s' % entity.split('/')[-1]
        #print rdf2vec_entity_label
        rdf2vec_entity_label = '<' + entity + '>'
        if rdf2vec_entity_label in embedded_entities:
            embedding_matrix[entity_id] = embedded_entities[rdf2vec_entity_label]
        else:
            missing += 1
    print "done loading gensim entities. %d missing" % missing
    # save embedding_matrix for entities in the training dataset
    np.save(save_to, embedding_matrix)
    # print embedding_matrix
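A short hedged usage note (the file name is hypothetical): the matrix saved with np.save can be reloaded later, for example to initialise an embedding layer:

import numpy as np

embedding_matrix = np.load("entity_embeddings.npy")
print(embedding_matrix.shape)  # (len(vocabulary), embeddings_config['dims'])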
github ChenglongChen / Kaggle_HomeDepot / Code / Igor&Kostia / word2vec.py
print "third vocab"   

#st conc pt conc pd conc br conc mr vocab w/o pars
t3 = list()
for i in range(len(st)):
    p = st1[i].split()+pt1[i].split()+pd1[i].split()+br1[i].split()+mr1[i].split()+ab1[i].split()+at1[i].split()
    t3.append(p)

print "fourth vocab" 

#trin models
model0 = gensim.models.Word2Vec(t, sg=1, window=10, sample=1e-5, negative=5, size=300)
model1 = gensim.models.Word2Vec(t1, sg=1, window=10, sample=1e-5, negative=5, size=300)
model2 = gensim.models.Word2Vec(t2, sg=1, window=10, sample=1e-5, negative=5, size=300)
model3 = gensim.models.Word2Vec(t3, sg=1, window=10, sample=1e-5, negative=5, size=300)
#model4 = gensim.models.Word2Vec(t, sg=0,  hs=1, window=10,   size=300)
#model5 = gensim.models.Word2Vec(t1, sg=0, hs=1,window=10,   size=300)
#model6 = gensim.models.Word2Vec(t2, sg=0, hs=1, window=10,   size=300)
#model7 = gensim.models.Word2Vec(t3, sg=0, hs=1,window=10,   size=300)

print "model prepared"


# for each model, calculate features: n_similarity between st and the other fields
model_list=[model0,model1,model2,model3]   #,model4  ,model5,model6,model7]
n_sim=list()

for model in model_list:

    n_sim_pt=list()
    for i in range(len(st)):
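The excerpt cuts off inside this loop; as a standalone, hedged sketch of what n_similarity computes (toy data, not the project's corpus):

import gensim

model = gensim.models.Word2Vec([["red", "power", "drill"], ["cordless", "drill"]],
                               sg=1, size=50, min_count=1)
a = [w for w in "power drill".split() if w in model.wv]
b = [w for w in "cordless drill".split() if w in model.wv]
if a and b:
    print(model.wv.n_similarity(a, b))  # cosine similarity between the two word sets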
github diegma / relation-autoencoder / learning / models / decoders / BilinearPlusSP.py
        CNP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, k, r)),
                         dtype=theano.config.floatX)  # @UndefinedVariable
        self.C = theano.shared(value=CNP, name='C')
        # self.C = theano.printing.Print("C = ")(self.C)

        # Selectional Preferences
        Ca1NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
        Ca2NP = np.asarray(rng.normal(0, math.sqrt(0.1), size=(k, r)), dtype=theano.config.floatX)
        self.C1 = theano.shared(value=Ca1NP, name='C1')
        self.C2 = theano.shared(value=Ca2NP, name='C2')
        # argument embeddings
        ANP = np.asarray(rng.uniform(-0.01, 0.01, size=(a, k)), dtype=theano.config.floatX)  # @UndefinedVariable

        if ex_emb:
            import gensim
            external_embeddings = gensim.models.Word2Vec.load(settings.external_embeddings_path)

            for idArg in range(self.a):
                arg = data.id2Arg[idArg].lower().split(' ')
                new = np.zeros(k, dtype=theano.config.floatX)
                size = 0
                for ar in arg:
                    if ar in external_embeddings:
                        new += external_embeddings[ar]
                        size += 1
                if size > 0:
                    ANP[idArg] = new/size

        self.A = theano.shared(value=ANP, name='A')  # (a1, k)

        self.Ab = theano.shared(value=np.zeros(a,  dtype=theano.config.floatX),  # @UndefinedVariable
                                 name='Ab', borrow=True)
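The block above averages word vectors to embed multi-word arguments; a self-contained, hedged sketch of that idea (model path and phrase are hypothetical):

import numpy as np
import gensim

model = gensim.models.Word2Vec.load("embeddings/word2vec.model")  # hypothetical path
words = "new york".lower().split(' ')
vecs = [model.wv[w] for w in words if w in model.wv]
if vecs:
    phrase_vec = np.mean(vecs, axis=0)  # average of the in-vocabulary word vectors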
github biolab / orange3-text / orangecontrib / text / topics / lsi.py
from gensim import models

from .topics import GensimWrapper

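# Rebind LsiModel.add_documents as update(), then route add_documents through
# update(), so that overriding update() also intercepts gensim's internal calls.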
models.LsiModel.update = models.LsiModel.add_documents
models.LsiModel.add_documents = lambda self, *args, **kwargs: self.update(*args, **kwargs)


class LsiWrapper(GensimWrapper):
    name = 'Latent Semantic Indexing'
    Model = models.LsiModel
    has_negative_weights = True
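For context, a minimal sketch of training a plain LsiModel with gensim (toy corpus invented for illustration):

from gensim import corpora, models

texts = [["human", "computer", "interaction"], ["graph", "trees", "minors"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
print(lsi.print_topics())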
github eellak / gsoc2018-3gm / 3gm / codifier.py
            'workers': max(1, multiprocessing.cpu_count() - 1),
            'sample': 1E-3,
        }

        all_sentences = []

        for law in self.laws.values():
            for article in law.sentences.keys():
                for par in law.sentences[article]:
                    for per in law.sentences[article][par]:
                        all_sentences.append(per)

        self.model = gensim.models.Word2Vec(all_sentences, **params)
        print('Model train complete!')
        self.model.wv.save_word2vec_format('model')

        return self.model
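A hedged round-trip check (assuming the gensim 3.x-style API used above): the vectors exported with save_word2vec_format, text format by default, can be reloaded with the matching loader:

import gensim

kv = gensim.models.KeyedVectors.load_word2vec_format('model')
print(kv.index2word[:3])  # first few vocabulary entries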
github houchengbin / DynWalks / src / libne / DynWalks.py
def sampling_traning(self):
          # SGNS and suggested parameters to be tuned: size, window, negative, workers, seed
          # to tune other parameters, please read https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
          w2v = gensim.models.Word2Vec(sentences=None, size=self.emb_dim, window=self.window, sg=1, hs=0, negative=self.negative, ns_exponent=0.75,
                              alpha=0.025, min_alpha=0.0001, min_count=1, sample=0.001, iter=4, workers=self.workers, seed=self.seed,
                              corpus_file=None, sorted_vocab=1, batch_words=10000, compute_loss=False,
                              max_vocab_size=None, max_final_vocab=None, trim_rule=None)  # w2v constructor, default parameters
     
          for t in range(len(self.G_dynamic)):
               t1 = time.time()
               if t == 0:   # offline ==============================
                    G0 = self.G_dynamic[t]    # initial graph
                    sentences = simulate_walks(nx_graph=G0, num_walks=self.num_walks, walk_length=self.walk_length)
                    sentences = [[str(j) for j in i] for i in sentences]
                    w2v.build_vocab(sentences=sentences, update=False) # init traning, so update False
                    w2v.train(sentences=sentences, total_examples=w2v.corpus_count, epochs=w2v.iter) # follow w2v constructor

               else:   # incremental adapting ==============================
                    G0 = self.G_dynamic[t-1]  # previous graph
                    G1 = self.G_dynamic[t]    # current graph
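The excerpt stops before the incremental branch; a self-contained, hedged sketch of the build_vocab(update=True) pattern it relies on (toy sentences, gensim 3.x-style API):

import gensim

w2v = gensim.models.Word2Vec(size=32, window=5, sg=1, negative=5, min_count=1)
first = [["a", "b", "c"], ["b", "c", "d"]]
w2v.build_vocab(first)                # initial vocabulary (update=False)
w2v.train(first, total_examples=w2v.corpus_count, epochs=4)

second = [["c", "d", "e"], ["e", "f", "a"]]
w2v.build_vocab(second, update=True)  # grow the vocabulary incrementally
w2v.train(second, total_examples=w2v.corpus_count, epochs=4)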
github jiangxinyang227 / NLP-Project / text_generator_raw / data_helpers / train_data.py
def get_word_vectors(self, vocab):
        """
        加载词向量,并获得相应的词向量矩阵
        :param vocab: 训练集所含有的单词
        :return:
        """
        word_vectors = (1 / np.sqrt(len(vocab)) * (2 * np.random.rand(len(vocab), self._embedding_size) - 1))
        if os.path.splitext(self._word_vectors_path)[-1] == ".bin":
            word_vec = gensim.models.KeyedVectors.load_word2vec_format(self._word_vectors_path, binary=True)
        else:
            word_vec = gensim.models.KeyedVectors.load_word2vec_format(self._word_vectors_path)

        for i in range(len(vocab)):
            try:
                vector = word_vec[vocab[i]]
                word_vectors[i, :] = vector
            except KeyError:
                print(vocab[i] + " is missing from the word vectors")

        return word_vectors
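As a closing note, the .bin branch above can be collapsed into a single call by computing the binary flag from the file extension; a hedged sketch (the path is hypothetical):

import os
import gensim

path = "vectors/word2vec.bin"  # hypothetical
binary = os.path.splitext(path)[-1] == ".bin"
word_vec = gensim.models.KeyedVectors.load_word2vec_format(path, binary=binary)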