How to use the gensim.models.Word2Vec function in gensim

To help you get started, we've selected a few gensim.models.Word2Vec examples, based on popular ways the function is used in public projects.

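Most of the excerpts below target the pre-4.0 gensim API, where the constructor took size and iter; gensim 4.0 renamed these to vector_size and epochs. As a reference point, here is a minimal end-to-end sketch against the 4.x API (corpus path and hyperparameter values are placeholders):

import gensim

# gensim 4.x sketch: stream a whitespace-tokenized corpus and train a skip-gram model
sentences = gensim.models.word2vec.LineSentence('corpus.txt')
model = gensim.models.Word2Vec(sentences, vector_size=100, window=5,
                               min_count=5, sg=1, workers=4, epochs=5)
model.save('word2vec.model')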

github akutuzov / webvectors / scripts / train_model.py
import sys
import os
import gensim

argument = sys.argv[1]
filename = argument.split('/')[-1]

# Training settings are encoded in the file name as "urlhash__algo__size__window".
args = filename.split('.')[0].split('__')
(urlhash, algo, vectorsize, windowsize) = args

skipgram = 1 if algo == "skipgram" else 0

# Stream the corpus line by line instead of loading it all into memory.
data = gensim.models.word2vec.LineSentence(argument)

# Pre-4.0 gensim API: size and iter were renamed vector_size and epochs in 4.0.
model = gensim.models.Word2Vec(data, size=int(vectorsize), min_count=2, window=int(windowsize),
                               sg=skipgram, workers=2, iter=5, cbow_mean=1)
model.init_sims(replace=True)  # L2-normalize the vectors in place
# 'root' is defined elsewhere in the original script.
model.save_word2vec_format(root + '/trained/' + filename.split('.')[0].split('__')[0] + '.model', binary=True)
os.remove(root + '/tmp/' + filename.split('.')[0].split('__')[0])
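In gensim ≥ 1.0 the vector I/O helpers moved onto the model's wv attribute, and init_sims was later deprecated; a minimal sketch of the equivalent save on a current gensim (the path is a placeholder):

# Modern equivalent of the save above: vector I/O lives on model.wv
model.wv.save_word2vec_format('/path/to/trained.model', binary=True)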
github kemaswill / Allen_AI_Science_Challenge_JunweiPan / word2vec.py
import gensim

def train_model(lst_sentence, path_model, min_count_p=5, workers_p=4, size_p=200, window_p=5):
    # cbow_mean=0 sums the context vectors instead of averaging them
    model = gensim.models.Word2Vec(lst_sentence, min_count=min_count_p, workers=workers_p,
                                   size=size_p, window=window_p, cbow_mean=0)
    model.save(path_model)
github STHSF / DeepNaturalLanguageProcessing / TextClassification / sentiment_analysis / sentiment_analysis_zh / word2vec_gensim_model.py
from multiprocessing import cpu_count

from gensim.models import Word2Vec

def word2vec_model(data, size, min_c):
    w2c_model = Word2Vec(size=size, min_count=min_c, workers=cpu_count())
    w2c_model.build_vocab(data)
    # gensim >= 1.0 needs explicit counts and epochs when train() is called separately
    w2c_model.train(data, total_examples=w2c_model.corpus_count, epochs=w2c_model.iter)

    return w2c_model
github idio / wiki2vec / resources / gensim / gensim_word2vec.py
import multiprocessing

import gensim

def read_corpus(path_to_corpus, output_path, min_count=10, size=500, window=10):
    workers = multiprocessing.cpu_count()
    sentences = gensim.models.word2vec.LineSentence(path_to_corpus)
    # sg=1 selects the skip-gram architecture; sg=0 would train CBOW
    model = gensim.models.Word2Vec(sentences, min_count=min_count, size=size,
                                   window=window, sg=1, workers=workers)
    model.save(output_path)
github iwangjian / ByteCup2018 / train_word2vec.py
import os
import logging
from time import time
from datetime import timedelta
import gensim

# 'Sentences' is the project's corpus iterator, defined elsewhere.
def main(args):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    start = time()
    save_dir = args.path
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    sentences = Sentences(args.data)
    model = gensim.models.Word2Vec(size=args.dim, min_count=5, workers=16, sg=1)
    model.build_vocab(sentences)
    print('vocab built in {}'.format(timedelta(seconds=time() - start)))
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

    model.save(os.path.join(save_dir, 'word2vec.{}d.{}k.bin'.format(args.dim, len(model.wv.vocab) // 1000)))
    model.wv.save_word2vec_format(os.path.join(save_dir,
        'word2vec.{}d.{}k.w2v'.format(args.dim, len(model.wv.vocab) // 1000)
    ))

    print('word2vec trained in {}'.format(timedelta(seconds=time() - start)))
github ethen8181 / machine-learning / deep_learning / word2vec / word2vec_workflow.py
    # Reuse a previously trained phrase model when a checkpoint exists.
    if os.path.exists(PHRASE_MODEL_CHECKPOINT):
        phrase_model = Phrases.load(PHRASE_MODEL_CHECKPOINT)
    else:
        logger.info('training phrase model')
        # use LineSentence to stream text as opposed to loading it all into memory
        unigram_sentences = LineSentence(UNIGRAM_PATH)
        phrase_model = Phrases(unigram_sentences)
        phrase_model.save(PHRASE_MODEL_CHECKPOINT)

    if not os.path.exists(BIGRAM_PATH):
        logger.info('converting words to phrases')
        export_bigrams(UNIGRAM_PATH, BIGRAM_PATH, phrase_model)

    if os.path.exists(WORD2VEC_CHECKPOINT):
        word2vec = Word2Vec.load(WORD2VEC_CHECKPOINT)
    else:
        logger.info('training word2vec')
        # corpus_file (gensim >= 3.6) trains straight from a LineSentence-format file
        word2vec = Word2Vec(corpus_file=BIGRAM_PATH, workers=cpu_count())
        word2vec.save(WORD2VEC_CHECKPOINT)

    logger.info('job completed')
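export_bigrams is the project's own helper and isn't shown in the excerpt; a minimal sketch of what such a helper might do, assuming the phrase model is applied to each tokenized line (the body is illustrative, not the author's code):

def export_bigrams(unigram_path, bigram_path, phrase_model):
    # Hypothetical sketch: applying the phrase model to a token list joins
    # detected collocations with '_' (e.g. ['new', 'york'] -> ['new_york'])
    with open(unigram_path) as fin, open(bigram_path, 'w') as fout:
        for line in fin:
            fout.write(' '.join(phrase_model[line.split()]) + '\n')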
github dataiku / dataiku-contrib / word2vec / custom-recipes / word2vec-transform / recipe.py
import numpy as np
import gensim

# 'modelformat', 'modelfile', 'agg_mean', 'keep_all_cols' and the dataset
# handles are recipe settings defined earlier in the original recipe.

def makeFeatureVec(words, model, num_features):
    # Sum (and optionally average) the vectors of all in-vocabulary words.
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model[word])
    if agg_mean:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

if modelformat == "gensim":
    model = gensim.models.Word2Vec.load(modelfile)
elif modelformat == "word2vec-text":
    model = gensim.models.Word2Vec.load_word2vec_format(modelfile, binary=False)
elif modelformat == "word2vec-binary":
    model = gensim.models.Word2Vec.load_word2vec_format(modelfile, binary=True)
else:
    raise Exception("Unknown model format: %s" % modelformat)
index2word_set = set(model.wv.vocab)
word2vecdim = model.wv.syn0.shape[1]

if keep_all_cols:
    myschema = [val for val in input_text_dataset.read_schema()]
else:
    myschema = []

for i in range(word2vecdim):
    myschema.append({"name": "word2vec_" + str(i), "type": "float"})
output_text_dataset.write_schema(myschema)
mywriter = output_text_dataset.get_writer()
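Word2Vec.load_word2vec_format was removed in gensim 1.0; on current versions, pretrained vectors load through KeyedVectors instead. A minimal sketch (the path is a placeholder):

from gensim.models import KeyedVectors

# Modern equivalent of the "word2vec-binary" branch above
word_vectors = KeyedVectors.load_word2vec_format('/path/to/vectors.bin', binary=True)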
github manasRK / word2vec-recommender / recommender_context.py
import gensim

# 'ContextCorpus' and 'data_obj' are defined elsewhere in the original project.
def train(model_file):
    contexts = ContextCorpus(data_obj)
    # Earlier hyperparameter experiments, kept commented for reference:
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=10, sg=1, size = 300, sample=1e-3, hs=1, window = 5) #a1
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=3, sg=0, size = 300, sample=1e-5, hs=0, window = 5) #a2
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=5, sg=0, size = 300, sample=1e-3, hs=1, window = 5) #a3
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=10, sg=1, size = 300, sample=1e-3, hs=0, window = 5) #a4
    #model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=10, sg=1, size = 300, sample=1e-5, hs=0, window = 5) #a5
    model = gensim.models.Word2Vec(contexts, min_count=5, workers=4, negative=3, sg=0, size=300, sample=1e-4, hs=1, window=5) #a6
    # ./word2vec -train train100B.txt -read-vocab voc -output vectors.bin -cbow 1 -size 300 -window 5 -negative 3 -hs 0 -sample 1e-5 -threads 12 -binary 1 -min-count 10
    model.init_sims(replace=True)  # L2-normalize the vectors in place
    model.save(model_file)
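The commented C-tool invocation maps almost one-to-one onto gensim keyword arguments (-cbow 1 → sg=0, -threads → workers); a rough sketch of the equivalent call under the same pre-4.0 API:

# Approximate gensim equivalent of the C word2vec command above
model = gensim.models.Word2Vec(contexts, sg=0, size=300, window=5, negative=3,
                               hs=0, sample=1e-5, workers=12, min_count=10)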
github jtourille / yaset / yaset / embed / gensim.py
import logging

import gensim
import numpy as np

# Excerpt: a method of the project's embedding-loader class.
def load_embedding(self):
    """
    Load embedding matrix and word count from gensim object
    :return: nothing
    """

    # Loading gensim object
    logging.debug("-> Loading gensim file")
    gensim_obj = gensim.models.Word2Vec.load(self.embedding_file_path)

    # Copying gensim object embedding matrix
    logging.debug("-> Fetching embedding matrix from gensim model")
    self.embedding_matrix = gensim_obj.wv.syn0

    logging.debug("-> Matrix dimension: {}".format(self.embedding_matrix.shape))

    # Creating token-id mapping; ids start at 1 so that index 0 can hold padding
    logging.debug("-> Creating word-id mapping")
    for i, item in enumerate(gensim_obj.wv.index2word, start=1):
        self.word_mapping[item] = i

    logging.debug("-> Creating padding vector (index=0)")
    pad_vector = np.random.rand(1, self.embedding_matrix.shape[1])
    self.embedding_matrix = np.insert(self.embedding_matrix, 0, pad_vector, axis=0)
    self.word_mapping["pad_token"] = 0
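On gensim ≥ 4.0 the same data is exposed under renamed attributes; a minimal sketch of the equivalent lookups:

# gensim 4.x equivalents of the attributes used above
embedding_matrix = gensim_obj.wv.vectors      # formerly wv.syn0
ordered_vocab = gensim_obj.wv.index_to_key    # formerly wv.index2word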
github kavgan / nlp-in-practice / word2vec / scripts / query.py
import gensim
import gensim.models.word2vec


model2 = gensim.models.Word2Vec.load("../models/model_win_2")
model = gensim.models.Word2Vec.load("../models/model")


word_list = [
    'gross',
    'dirty',
    'location',
    'breakfast',
    'smelly',
    'affordable',
    'hotel staff',
    'manager rude',
    'complimentary',
    'family',
    'awe',
    'shocked',
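The excerpt breaks off inside word_list; presumably the script then queries each term against the loaded models. A minimal sketch of such a loop, assuming the list is completed and using the standard most_similar lookup (with a pre-4.0 vocabulary check):

# Hypothetical continuation: nearest neighbours for each query term
for word in word_list:
    if word in model.wv.vocab:
        print(word, model.wv.most_similar(word, topn=5))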