import numpy as np
import pandas as pd
from gensim.models import Word2Vec

def get_word2vec(content):
    # Average the 200-d embeddings of all in-vocabulary tokens in `content`.
    # (direct `word in model` / `model[word]` lookups follow the older gensim API used here)
    word2vec = Word2Vec.load('predictor/model/wiki.zh.seg_200d.model')
    res = np.zeros([200])
    count = 0
    # word_list = content.split()
    for word in content:
        if word in word2vec:
            res += word2vec[word]
            count += 1
    if count == 0:  # avoid division by zero when no token is in the vocabulary
        return pd.Series(res)
    return pd.Series(res / count)
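# A minimal usage sketch (not in the original source): build a tiny DataFrame
# whose 'content' column holds token lists and expand each row into a
# 200-dimensional averaged embedding. The column name and example tokens are
# assumptions for illustration only.
df = pd.DataFrame({'content': [[u'天气', u'很', u'好'], [u'自然', u'语言', u'处理']]})
features = df['content'].apply(get_word2vec)   # DataFrame of shape (2, 200)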
import gensim
import numpy as np
import theano
from gensim.models import Word2Vec

def createtopicvec(word2vec_path):
    # Build a (num_topics, max_topicword, embedding_dim) matrix of topic-word embeddings.
    max_topicword = 20
    model = Word2Vec.load(word2vec_path)
    topicmatrix = np.zeros(shape=(100, max_topicword, 100), dtype=theano.config.floatX)
    file = open(r"\\msra-sandvm-001\v-wuyu\Data\SemEvalCQA"
                r"\semeval2015-task3-english-data\pre-process\stemming_preservestop_cate\catedic.txt")
    i = 0
    miss = 0
    for line in file:
        tmp = line.strip().split(' ')
        for j in range(min(len(tmp), max_topicword)):
            if gensim.utils.to_unicode(tmp[j]) in model.vocab:
                topicmatrix[i, j, :] = model[gensim.utils.to_unicode(tmp[j])]
            else:
                miss = miss + 1
        i = i + 1
    print "miss word2vec", miss
    return topicmatrix
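# A possible follow-up (an assumption, not shown in the original source): wrap
# the returned (100, 20, 100) topic matrix as a Theano shared variable so it
# can be used, and optionally fine-tuned, inside a Theano computation graph.
# The model path below is a placeholder.
topic_embeddings = theano.shared(createtopicvec("topic_word2vec.model"),
                                 name="topic_embeddings")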
# "translate" language pair notation for task 2
languagePair2 = languagePair.upper().replace("-", "_")

t2 = WMT15QETask2(languagePair2, "../WMT15-data/task2_"+languagePair+"_dev_comb",
                  "../WMT15-data/task2_"+languagePair+"_train_comb",
                  targetWindowSize=targetWindowSize, sourceWindowSize=sourceWindowSize,
                  featureIndices=featureIndices, alignments=s2tAlignments,
                  badWeight=badweight, lowercase=lowerCase, full=full)
contextSize = t2.contextSize
print "... context size", contextSize
#print t2.wordDictionary
vocabularySize = len(t2.wordDictionary)

# load pretrained gensim word2vec model
params = None
if pretrainedModel is not None:
    print "... Loading pretrained model from file", pretrainedModel
    try:
        model = gensim.models.word2vec.Word2Vec.load(pretrainedModel)
        #print model["computer"]
        lc = False
        if ".lc." in pretrainedModel:
            lc = True
            print "... lowercasing"
        # construct initial lookup table from pretrained model
        params = constructLT(model, t2.wordDictionary, d_wrd, lc)
    except AttributeError:  # full model, not only LT pretrained
        params = loadParams(pretrainedModel)
#get instance vectors and binary labels for training
import multiprocessing
import os

import numpy
from gensim.models import Word2Vec

# `conf` (configuration constants) and `preprocess` (tokeniser) come from the
# surrounding project and are not shown here.

def evaluate(cat, fold, txt_train, txt_test, y_train, y_test):
    pool = multiprocessing.Pool()
    wordlists_train = pool.map(preprocess, txt_train)
    wordlists_test = pool.map(preprocess, txt_test)
    pool.close()
    pool.join()
    emb = Word2Vec.load(os.path.join(conf.W2V_DIR, 'model'))
    # add point at origin for unknown words
    emb.wv.syn0 = numpy.vstack((emb.wv.syn0,
                                numpy.zeros(emb.wv.syn0.shape[1], dtype=numpy.float32)))
    # train data: replace words with embedding IDs, zero-padding and truncation
    X = numpy.zeros((len(y_train), conf.LSTM_MAXPOSTLEN), dtype=numpy.int32)
    X_lengths = numpy.zeros((len(y_train)))
    for i, words in enumerate(wordlists_train):
        X_lengths[i] = len(words)
        for j, w in enumerate(words):
            if j >= conf.LSTM_MAXPOSTLEN:
                break
            if w in emb:
                X[i, j] = emb.vocab[w].index
            else:
                X[i, j] = len(emb.vocab)
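# Note (added for clarity, not in the original source): because one zero row
# was appended to emb.wv.syn0 above, the unknown-word index len(emb.vocab)
# looks up an all-zero vector. The augmented matrix can therefore serve
# directly as the weight matrix of a downstream embedding layer, e.g.:
#
#     embedding_weights = emb.wv.syn0   # shape: (len(emb.vocab) + 1, dim)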
print("We're gonna train the model now...")
vec_model.build_vocab(sentences)
# Pass in all of the necessary training variables
vec_model.train(
    sentences,
    total_examples=vec_model.corpus_count,
    epochs=vec_model.iter
)
if not os.path.exists("trained"):
    os.makedirs("trained")
vec_model.save(os.path.join("trained", "trained_model.w2v"))
vec_model = w2v.Word2Vec.load(os.path.join("trained", "trained_model.w2v"))
print("We're just gonna compress the dimensions... hang tight!")
# Compress the words into a 2d Vector Space using t-distributed stochastic neighbour embedding
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
all_word_vectors_matrix = vec_model.wv.syn0
all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)
points = pd.DataFrame(
    [
        (word, coords[0], coords[1])
        for word, coords in [
            (word, all_word_vectors_matrix_2d[vec_model.wv.vocab[word].index])
            for word in vec_model.wv.vocab
        ]
    ],
    columns=["word", "x", "y"]
)
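# A hedged follow-up (not from the original source): plot the 2-D t-SNE
# projection with matplotlib. The 'x'/'y' column names match the completed
# DataFrame above; figure and marker sizes are arbitrary choices.
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 12))
plt.scatter(points["x"], points["y"], s=2)
plt.show()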
def load(self) -> None:
    """Load Word2vec model"""
    logger.info("Loading word2vec model from {}".format(self.load_path))
    self.model = Word2Vec.load(self.load_path)
def applyWord2VecModel(modelname):
    model = word2vec.Word2Vec.load(modelname)
    for key in KEYWORDS_LONG['trump']:
        print("\n", key)
        for res in model.most_similar(key, topn=60):
            print(res)
else:
    sents = self.sentences_array
self.path = self.path + str(it) + ".npy"
print "Learning:" + self.path
print "CCCC!"  # debug output left from the original code
if not os.path.exists(self.path):
    print "Entra"  # debug: "entering" (no existing model found, so train one)
    entrada = []
    results = Parallel(n_jobs=num_cores, backend="threading")(
        delayed(generate_sample)(self.mode, sents, self.degree, self.w_size, i)
        for i in range(1, self.ns))
    for r in results:
        entrada.append(r)
    self.w2v = word2vec.Word2Vec(entrada, size=self.ndim, window=self.w_size,
                                 min_count=1, workers=num_cores, sg=0)
    self.w2v.save(self.path)
    print "TERMINO"  # debug: "finished"
else:
    self.w2v = word2vec.Word2Vec.load(self.path)
self.get_nodes()
self.get_rels([])
self.delete_props()
def convert_model(prefix):
    ln.info("loading model")
    w2v = Word2Vec.load(prefix)
    ln.info("saving dict...")
    dict_file = prefix + ".wordids.txt"
    with open(dict_file, "w") as f:
        for word, voc_obj in w2v.vocab.items():
            f.write((u"%s\t%s\n" % (word, voc_obj.index)).encode("UTF-8"))
    ln.info("saving weights as csv...")
    weights_file = prefix + ".syn0.csv"
    np.savetxt(weights_file, w2v.syn0, delimiter=",", header="%s\n%s" % w2v.syn0.shape)
    ln.info("all done. Saved converted model files: %s and %s." % (weights_file, dict_file))
def loadModelfromFile(self, modelFilePath):
    '''
    Load an existing model from disk.
    Training can be continued with the loaded model (needs more testing).
    '''
    return Word2Vec.load(modelFilePath)
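# A hedged sketch of continuing training with a loaded model, as the docstring
# suggests. 'more_sentences' (a list of token lists), the instance name
# 'trainer', the file path and the epoch count are all assumptions.
model = trainer.loadModelfromFile("word2vec.model")
model.build_vocab(more_sentences, update=True)   # register any new words first
model.train(more_sentences, total_examples=len(more_sentences), epochs=5)
model.save("word2vec.model")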