def load_w2v(word):
    """Return a random word similar to `word`, or `word` itself if lookup fails."""
    model = word2vec.Word2Vec.load('../src/' + model_file)
    try:
        similar_words = model.most_similar(positive=[word])
        return random.choice([w[0] for w in similar_words])
    except KeyError:
        # Out-of-vocabulary words raise KeyError; fall back to the input word.
        return word
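A minimal usage sketch, assuming the module-level model_file points at a saved gensim model (the filename below is an assumption, not from the original snippet):

from gensim.models import word2vec
import random

model_file = 'skipgram.model'  # hypothetical filename
print(load_w2v('good'))  # a random neighbour of 'good', or 'good' itself if OOV

Since load_w2v reloads the model on every call, callers that look up many words would do better to load the model once and reuse it.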
def __init__(self, path_to_sense_model, path_to_context_model, window=10, method="sep", filter_ctx=False):
    self.vs = word2vec.Word2Vec.load_word2vec_format(path_to_sense_model, binary=True)
    self.vc = word2vec.Word2Vec.load_word2vec_format(path_to_context_model, binary=True)
    self.window = window
    self.ctx_method = method
    self.filter_ctx = filter_ctx
    print("Disambiguation method: " + self.ctx_method)
    print("Filter context: %s" % (self.filter_ctx))
def extractFeaturesW2V(w2vmodel="skip_nostop_multi_300features_10minwords_10context", phrasemodel="phrase.model", useDev=False):
    if not useDev:
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        # Train on train + original dev, and evaluate on the test file instead.
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILETEST, 'windows-1252', 2)
    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodel)
    features_train_w2v = extractW2VAggrFeatures(w2vmodel, phmodel, tweets_train, targets_train, labels_train)
    features_dev_w2v = extractW2VAggrFeatures(w2vmodel, phmodel, tweets_dev, targets_dev, labels_dev)
    return features_train_w2v, labels_train, features_dev_w2v, labels_dev
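A hypothetical call, assuming the phrase and word2vec model files sit where the defaults point:

features_train, labels_train, features_dev, labels_dev = extractFeaturesW2V(useDev=False)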
def mode_training():
    """
    Model training.
    """
    # Read the word-segmented corpus; a directory iterator could be used instead:
    # sentences = MySentences('/some/directory')
    sentences = word2vec.Text8Corpus('data/xuezhong_seg_1.txt')
    # Train the model. `size` sets the dimensionality of the word vectors
    # (not the number of network layers); `workers` sets the number of
    # training threads, which only takes effect when Cython is installed.
    model = word2vec.Word2Vec(
        sentences, min_count=20, size=4000, window=10, workers=4)
    # model.sort_vocab()
    # Compute the similarity/relatedness of two words:
    # simil_1 = model.wv.similarity(u"王仙芝", u"老怪物")
    # simil_2 = model.wv.similarity(u"徐凤年", u"殿下")
    # print("Similarity of 王仙芝 and 老怪物:", simil_1)
    # print("Similarity of 徐凤年 and 殿下:", simil_2)
    # List the words most related to a given word:
    # lar = model.wv.most_similar(u"徐凤年", topn=20)  # 20 most related words
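A short follow-up sketch, persisting the model trained above and querying it later; the save path is an assumption, not from the original snippet:

model.save('data/xuezhong.model')
reloaded = word2vec.Word2Vec.load('data/xuezhong.model')
for word, score in reloaded.wv.most_similar(u"徐凤年", topn=5):
    print(word, score)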
def predict_input():
    from collections import Counter
    import pandas as pd
    import numpy as np
    import gensim
    import jieba
    model = gensim.models.word2vec.Word2Vec.load('w2v_model_wiki_word')
    def keywords(s):
        """
        Code from 苏剑林. (2017, Apr 07). 《【不可思议的Word2Vec】 3.提取关键词 》
        ("The Incredible Word2Vec, Part 3: Extracting Keywords") [Blog post].
        Retrieved from https://www.spaces.ac.cn/archives/4316
        :param s: a sequence of already-segmented words
        :return: ranked keywords (the snippet is truncated here; a completion sketch follows below)
        """
        def predict_proba(oword, iword):
            # Log-probability of output word `oword` given input word `iword`,
            # read off the hierarchical-softmax tree (model.syn1).
            iword_vec = model[iword]
            oword = model.wv.vocab[oword]
            oword_l = model.syn1[oword.point].T
            dot = np.dot(iword_vec, oword_l)
            lprob = -sum(np.logaddexp(0, -dot) + oword.code * dot)
            return lprob
        s = [w for w in s if w in model]
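        # Hedged completion, following the cited blog post: score each
        # in-vocabulary word by how well it predicts the rest of the
        # sentence, then rank the scores with Counter.
        ws = {w: sum([predict_proba(u, w) for u in s]) for w in s}
        return Counter(ws).most_common()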
    :param postprocess: force against/favor for tweets which contain the target
    :param shortenTargets: shorten the target text, see preprocess.transform_targets()
    :param useAutoTrump: use automatically annotated Trump tweets; experimental, not helping at the moment
    :param useClinton: add the Hillary Clinton dev data to the train data
    :param testSetting: evaluate on Trump
    """
    if word2vecmodel == "small":
        w2vmodel = word2vec.Word2Vec.load("../out/skip_nostop_single_100features_5minwords_5context")
    else:
        w2vmodel = word2vec.Word2Vec.load("../out/skip_nostop_single_100features_5minwords_5context_big")
    if usePhrases:
        phrasemodel = Phrases.load("../out/phrase_all.model")
        w2vmodel = word2vec.Word2Vec.load("../out/skip_nostop_multi_100features_5minwords_5context")
    if testSetting == "true":
        trainingdata = "../data/semeval2016-task6-train+dev.txt"
        testdata = "../data/SemEval2016-Task6-subtaskB-testdata-gold.txt"
    elif testSetting == "weaklySup":
        trainingdata = "../data/trump_autolabelled.txt"
        testdata = "../data/SemEval2016-Task6-subtaskB-testdata-gold.txt"
        enc = "utf-8"
    else:
        trainingdata = "../data/semeval2016-task6-trainingdata_new.txt"
        testdata = "../data/semEval2016-task6-trialdata_new.txt"
    if not useClinton:
        trainingdata = "../data/semeval2016-task6-trainingdata_new.txt"
    # NB: `enc` is only assigned in the "weaklySup" branch; the other paths
    # assume it was defined earlier in the (truncated) function.
    tweets, targets, labels, ids = reader.readTweetsOfficial(trainingdata, encoding=enc)
def create_model(self, fname='text8'):
    # Use the `fname` argument rather than a hard-coded corpus path.
    sentences = word2vec.Text8Corpus('data/' + fname)
    model = word2vec.Word2Vec(sentences, size=self.dim)
    model.save('data/text8.model')
    print(':: model saved to data/text8.model')
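Reloading the saved model later is the mirror image (a sketch; the query word assumes an English corpus such as text8):

model = word2vec.Word2Vec.load('data/text8.model')
print(model.wv.most_similar('king', topn=3))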
def main():
    nlp = spacy.load('en')
    # nlp = whitespace_nlp_with_sentences
    convention_df = SampleCorpora.ConventionData2012.get_data()
    convention_df['parsed'] = convention_df.text.apply(nlp)
    corpus = (CorpusFromParsedDocuments(convention_df,
                                        category_col='party',
                                        parsed_col='parsed')
              .build()
              .get_unigram_corpus())
    model = word2vec.Word2Vec(size=100,
                              alpha=0.025,
                              window=5,
                              min_count=5,
                              max_vocab_size=None,
                              sample=0,
                              seed=1,
                              workers=1,
                              min_alpha=0.0001,
                              sg=1,
                              hs=1,
                              negative=0,
                              cbow_mean=0,
                              iter=10,
                              null_word=0,
                              trim_rule=None,
                              sorted_vocab=1)
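Built this way, without a sentences argument, the model has an empty vocabulary and no trained weights; gensim 2–3.x (the versions matching the iter keyword above) then expects an explicit two-step training pass. A minimal sketch, where the token lists are stand-ins for the real corpus:

docs = [['example', 'tokenized', 'sentence'], ['another', 'one']]  # assumed input
model.build_vocab(docs)
model.train(docs, total_examples=model.corpus_count, epochs=model.iter)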
def train(self, corpus):
    # `corpus` is a callable returning a fresh iterator; it is called twice
    # because gensim needs one pass to build the vocabulary and one to train.
    self.model = word2vec.Word2Vec(size=self.dim, min_count=self.min_count,
                                   window=self.window, workers=multiprocessing.cpu_count(),
                                   sg=self.sg, hs=self.hs, negative=self.negative,
                                   iter=self.epoches)
    self.model.build_vocab(corpus())
    self.model.train(corpus())
    return self
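The MySentences pattern commented out in the earlier training snippet is the usual shape for such a corpus factory: a restartable iterable, wrapped in a callable so each call yields a fresh iterator. A minimal sketch (directory path and whitespace tokenization are assumptions); note that gensim 1.0+ additionally requires total_examples and epochs in train():

import os

class MySentences:
    """Restartable iterator over whitespace-tokenized lines in a directory."""
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            with open(os.path.join(self.dirname, fname)) as fh:
                for line in fh:
                    yield line.split()

# hypothetical call: trainer.train(lambda: MySentences('/some/directory'))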
from gensim.models import word2vec
from RandomVec import RandomVec
import numpy as np
import random
import sys
import pickle as pkl

WORD_DIM = 300
model = word2vec.Word2Vec.load_word2vec_format('../pickles/GoogleNews-vectors-negative300.bin', binary=True)
rvec = RandomVec(WORD_DIM)
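Pairing pretrained vectors with a RandomVec store usually serves out-of-vocabulary fallback. A hedged sketch of such a lookup; get_vector and the getVec method are assumptions, not confirmed by the snippet:

def get_vector(word):
    try:
        return model[word]  # pretrained GoogleNews vector
    except KeyError:
        return rvec.getVec(word)  # assumed RandomVec API for a cached random vector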
def findMaxLength(FILE_NAME):
    """Length of the longest block of consecutive non-blank lines, e.g. the
    longest sentence in a CoNLL-style file where blank lines separate sentences."""
    temp = 0
    max_length = 0
    for line in open(FILE_NAME):
        if line in ['\n', '\r\n']:
            if temp > max_length:
                max_length = temp
            temp = 0
        else:
            temp += 1
    # Count the final block too, in case the file does not end with a blank line.
    return max(max_length, temp)
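A hypothetical call, assuming a CoNLL-style file where blank lines separate sentences:

print(findMaxLength('../data/train.conll'))  # length of the longest sentence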