random.seed( random_seed )
log.info("Using random seed %s" % random_seed )
# Will we use automatic model selection?
validation_measure = None
if len(kparts) == 1:
    kmax = kmin
else:
    kmax = int(kparts[1])
if kmax < kmin:
    kmax = kmin
# any word2vec model specified?
if options.model_path is not None:
    log.info( "Loading Word2Vec model from %s ..." % options.model_path )
    import gensim
    model = gensim.models.Word2Vec.load(options.model_path)
    validation_measure = unsupervised.coherence.WithinTopicMeasure( unsupervised.coherence.ModelSimilarity(model) )
# NMF implementation
impl = unsupervised.nmf.SklNMF( max_iters = options.maxiter, init_strategy = "nndsvd" )
# Process each specified time window document-term matrix
selected_ks = []
for matrix_filepath in args:
    # Load the cached corpus
    window_name = os.path.splitext( os.path.split( matrix_filepath )[-1] )[0]
    log.info( "- Processing time window matrix for '%s' from %s ..." % (window_name,matrix_filepath) )
    (X,terms,doc_ids) = text.util.load_corpus( matrix_filepath )
    log.info( "Read %dx%d document-term matrix" % ( X.shape[0], X.shape[1] ) )
    # Ensure that the values of kmin and kmax are not greater than the number of documents
    num_docs = len(doc_ids)
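For context, the WithinTopicMeasure/ModelSimilarity pair used above scores candidate topic models against the word2vec model. A minimal sketch of that idea, using a hypothetical topic_coherence helper rather than the actual unsupervised.coherence API, is the mean pairwise similarity of a topic's top terms:

import itertools

def topic_coherence(w2v_model, top_terms):
    # Mean pairwise word2vec similarity over the topic's top-ranked terms,
    # skipping pairs with out-of-vocabulary words (model.wv on gensim >= 1.0).
    sims = [w2v_model.wv.similarity(a, b)
            for a, b in itertools.combinations(top_terms, 2)
            if a in w2v_model.wv and b in w2v_model.wv]
    return sum(sims) / len(sims) if sims else 0.0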
# -*- coding:utf-8 -*-
from __future__ import unicode_literals
import logging
import gensim
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import VectorizerMixin
from wende.classification.nlp import tokenize
from wende.config import WORD2VEC_MODEL_DIR, WORD2VEC_MODEL_SIZE
logging.info("loading word2vec model...")
w2v_model = gensim.models.Word2Vec.load(WORD2VEC_MODEL_DIR)
def gen_doc_vec(words, num_features):
    # drop terms that are not in the word2vec vocabulary
    words = filter(lambda x: x in w2v_model, words)
    doc_vec = np.zeros(num_features, dtype="float32")
    word_count = 0
    for word in words:
        word_count += 1
        doc_vec += w2v_model[word]
    # avoid division by zero for documents with no in-vocabulary terms
    word_count = 1 if word_count == 0 else word_count
    doc_vec /= word_count
    return doc_vec
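For illustration, gen_doc_vec can be applied to any token list, as long as num_features matches the dimensionality of the loaded model (WORD2VEC_MODEL_SIZE here); the tokens below are placeholders:

# Illustrative call, assuming w2v_model has been loaded as above
tokens = ["machine", "learning"]
doc_vec = gen_doc_vec(tokens, WORD2VEC_MODEL_SIZE)
print(doc_vec.shape)  # (WORD2VEC_MODEL_SIZE,)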
def __init__(self, w2vf='../resources/wordemb/w2v/c10_w3_s100',
             sswef='../resources/wordemb/sswe'):
    self.w2v = gensim.models.Word2Vec.load(w2vf)
    self.sswe = readTang(sswef)
    self.lexicons = lexicon()
def load_word2vec():
"""
Loads the word2vec model used in this work.
:return: a word2vec model.
"""
return Word2Vec.load(MODEL_SOURCE)
def get_w2v(path):
"""
Reading word2vec model given the path
"""
return gensim.models.Word2Vec.load(path)
def smart_load_embedding(model_path, doc2vec=False):
print("Smart loading", model_path)
if model_path is None:
return None
_, ext = os.path.splitext(model_path)
if doc2vec:
print("Loading Doc2Vec model:", model_path)
model = Doc2Vec.load(model_path)
elif ext == ".gnsm": # Native format
print("Loading embeddings in native gensim format: {}"
.format(model_path))
model = Word2Vec.load(model_path)
else: # either word2vec text or word2vec binary format
binary = ".bin" in model_path
print("Loading embeddings in word2vec format: {}".format(model_path))
model = Word2Vec.load_word2vec_format(model_path, binary=binary)
return model
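One caveat when reusing this helper: Word2Vec.load_word2vec_format was deprecated in gensim 1.0 and removed in 4.0. On newer gensim versions, the last branch would load the vectors through KeyedVectors instead, roughly:

from gensim.models import KeyedVectors

# Modern equivalent of the word2vec text/binary branch above
model = KeyedVectors.load_word2vec_format(model_path, binary=binary)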
def main(vocab_size, Y, data, vocab_min):
    wv_file = os.path.join(DATA_DIR, "raw.w2v") if data == "raw" else os.path.join(DATA_DIR, "processed_%d.w2v" % (Y))
    model = gensim.models.Word2Vec.load(wv_file)
    wv = model.wv
    # free up memory
    del model
    v_dict, _ = load_lookups(vocab_size, Y, vocab_min)
    # go through vocab in order
    # find vocab word in wv.index2word, then call wv.word_vec(wv.index2word[i])
    # put results into one big matrix
    W, words = build_matrix(v_dict, wv)
    # save the embedding matrix and word list
    outfile = os.path.join(DATA_DIR, "raw.embed") if data == "raw" else os.path.join(DATA_DIR, "processed_%d.embed" % (Y))
    save_embeddings(W, words, outfile)
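build_matrix and load_lookups are project-specific helpers not shown here; a minimal sketch of build_matrix that follows the comments above (hypothetical, assuming v_dict maps row indices to vocabulary words) could look like:

import numpy as np

def build_matrix(v_dict, wv):
    # One embedding row per vocabulary word, in index order;
    # out-of-vocabulary words fall back to a zero vector.
    words = [v_dict[i] for i in sorted(v_dict)]
    W = np.zeros((len(words), wv.vector_size), dtype=np.float32)
    for row, word in enumerate(words):
        if word in wv:
            W[row] = wv.word_vec(word)
    return W, words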
:param use_stopword: whether to use a stop-word list
:param stop_words_file: path to the stop-word file
:param use_w2v: whether to use word vectors to compute sentence similarity
:param dict_path: path to the word vector (word2vec) model file
:param max_iter: maximum number of iterations
:param tol: convergence tolerance (maximum allowed error)
"""
if not use_w2v and dict_path is not None:
    raise RuntimeError("use_w2v must be set to True before word vectors can be used")
self.__use_stopword = use_stopword
self.__use_w2v = use_w2v
self.__dict_path = dict_path
self.__max_iter = max_iter
self.__tol = tol
if self.__use_w2v:
    self.__word2vec = Word2Vec.load(self.__dict_path)
self.__stop_words = set()
self.__stop_words_file = self.get_default_stop_words_file()
if type(stop_words_file) is str:
    self.__stop_words_file = stop_words_file
if use_stopword:
    for word in codecs.open(self.__stop_words_file, 'r', 'utf-8', 'ignore'):
        self.__stop_words.add(word.strip())
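When use_w2v is enabled, the loaded vectors are presumably used to score sentence similarity; one common formulation, shown here only as a sketch and not necessarily this class's actual method, is the cosine similarity of averaged word vectors:

import numpy as np

def w2v_sentence_similarity(word2vec, words1, words2):
    # Average the in-vocabulary word vectors of each sentence,
    # then return the cosine similarity of the two averages.
    def avg_vec(words):
        vecs = [word2vec.wv[w] for w in words if w in word2vec.wv]
        return np.mean(vecs, axis=0) if vecs else np.zeros(word2vec.wv.vector_size)
    v1, v2 = avg_vec(words1), avg_vec(words2)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0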
np.seterr(all='warn')  # Print a RuntimeWarning for all types of floating-point errors
model_type = sys.argv[3]
model_name = str(num_features) + "features_" + str(min_word_count) + "minwords_" + str(
    context) + "context_len2alldata"
# Load train data.
train = pd.read_csv('data/train_v2.tsv', header=0, delimiter="\t")
# Load test data.
test = pd.read_csv('data/test_v2.tsv', header=0, delimiter="\t")
all = pd.read_csv('data/all_v2.tsv', header=0, delimiter="\t")
assert model_type in ["word2vec", "fasttext"]
if model_type == "word2vec":
# Load the trained Word2Vec model.
model = Word2Vec.load(model_name)
# Get wordvectors for all words in vocabulary.
word_vectors = model.wv.vectors
index2word = model.wv.index2word
elif model_type == "fasttext":
# Load the trained FastText model.
model = FastText.load(model_name)
# Get wordvectors for all words in vocabulary.
word_vectors = model.wv.vectors
index2word = model.wv.index2word
# Set number of clusters.
num_clusters = int(sys.argv[2])
# Uncomment below line for creating new clusters.
idx, idx_proba = cluster_GMM(num_clusters, word_vectors)
# Uncomment the lines below to load saved cluster assignments and their probabilities.
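cluster_GMM is defined elsewhere in that script; a rough sketch using scikit-learn's GaussianMixture, offered as an assumption about the implementation with illustrative hyperparameters, would be:

from sklearn.mixture import GaussianMixture

def cluster_GMM(num_clusters, word_vectors):
    # Fit a Gaussian mixture over the word vectors and return
    # hard cluster assignments plus per-cluster membership probabilities.
    gmm = GaussianMixture(n_components=num_clusters, covariance_type="tied", max_iter=100)
    gmm.fit(word_vectors)
    return gmm.predict(word_vectors), gmm.predict_proba(word_vectors)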
#Author: Phi Van Thuy
#Purpose: Convert word2vec models to JSON database by cosine distance metric
from gensim.models import Word2Vec
#Trained model
# model_path = "/cl/work/thuy-ph/word2vec/GoogleNews-vectors-negative300.bin"
print "Loading model..."
model = Word2Vec.load('emma-model')
# model = word2vec.Word2Vec.load_word2vec_format(model_path, binary=True) # C binary format
print "Loading model: Done"
#Name of output file
f = open('en_data_cosine_skipgram_original.json','w')
f.write("{\n")
number_words = len(model.vocab)
#number_words = 10000
for i in range(0, number_words):
    stringA = model.vocab.items()[i][0]
    f.write("\n\"" + stringA.encode("utf-8") + "\":[\n")
    nearest_words = model.most_similar(positive=[stringA], negative=[], topn=20)
    number_nearest_words = len(nearest_words)
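This last script is Python 2 and relies on the pre-1.0 gensim API (model.vocab, model.most_similar); on gensim 4.x the equivalent attributes live on model.wv, so the same vocabulary loop would look roughly like this sketch:

# gensim 4.x equivalent of the vocabulary loop above (sketch only)
for stringA in model.wv.index_to_key:
    nearest_words = model.wv.most_similar(positive=[stringA], topn=20)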