###################
# Word Embeddings #
###################
import gensim
import matplotlib.pyplot as plt
from tqdm import tqdm
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
my_path = 'C:\\Users\\ASUS\\Documents\\Telecom\\PRIM\\code\\w2v\\'
# quora corpus: clean and tokenize both question columns (clean_sent and df come from earlier in the script)
corpus = [clean_sent(q).split() for q in df['question1']]
corpus += [clean_sent(q).split() for q in df['question2']]
# initialize W2V model
my_model = gensim.models.word2vec.Word2Vec(size=300, min_count=2, sg=1)
my_model.build_vocab(corpus)
# seed the vocabulary with pre-trained GloVe vectors (gensim < 4.0 API);
# lockf=1.0 keeps the imported vectors trainable so the fine-tuning step below can update them
my_model.intersect_word2vec_format(my_path + "glove.6B.300d.txt", binary=False, lockf=1.0)
# fine tune on quora corpus
my_model.train(corpus, total_examples=my_model.corpus_count, epochs=my_model.iter)
# trim memory: replace the raw vectors with their L2-normalized versions
my_model.init_sims(replace=True)
# Word Mover's Distance
wmd_true = []
for q1, q2 in tqdm(zip(df_true_duplicate['question1'], df_true_duplicate['question2'])):
    clean_q1 = clean_sent(q1).split()
    clean_q2 = clean_sent(q2).split()
    # the loop body was cut off in the source; appending the pairwise WMD is the
    # natural completion (requires the pyemd package)
    wmd_true.append(my_model.wv.wmdistance(clean_q1, clean_q2))
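# Sanity-check sketch for the fine-tuned embeddings (the query words below are
# illustrative and must occur at least min_count times in the quora corpus):
print(my_model.wv.most_similar('python', topn=5))
print(my_model.wv.similarity('python', 'programming'))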
# Method fragment from a model-builder class; the def line below is a hypothetical
# reconstruction (the original name and signature are not shown). Word2Vec and
# LineSentence are imported from gensim upstream. fileType selects how the corpus
# is fed to gensim's Word2Vec.
def produceModel(self, corpusFilePath, fileType):
    model = None
    if fileType == u'opened':
        # a path to a single pre-processed corpus file
        print('training model from singleFile!')
        model = Word2Vec(LineSentence(corpusFilePath), size=self._size, window=self._window,
                         min_count=self._minCount, workers=self._workers)
    elif fileType == u'file':
        # hand an open file object to LineSentence instead of a path
        corpusFile = open(corpusFilePath, u'r')
        print('training model from singleFile!')
        model = Word2Vec(LineSentence(corpusFile), size=self._size, window=self._window,
                         min_count=self._minCount, workers=self._workers)
        corpusFile.close()
    elif fileType == u'directory':
        corpusFiles = localFileOptUnit.listAllFilePathInDirectory(corpusFilePath)
        print('training model from listFiles of directory!')
        sentences = localFileOptUnit.loadSetencesFromFiles(corpusFiles)
        model = Word2Vec(sentences, size=self._size, window=self._window,
                         min_count=self._minCount, workers=self._workers)
    elif fileType == u'other':
        # TODO add sentences list directly
        pass
    model.save(self.modelPath)
    model.init_sims()
    print('producing word2vec model ... ok!')
    return model
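# Usage sketch: reload a model saved by the method above ('w2v.model' stands in for
# whatever self.modelPath pointed to):
from gensim.models import Word2Vec
reloaded = Word2Vec.load('w2v.model')
print('%d words in vocabulary' % len(reloaded.wv.vocab))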
def train_w2v():
    """
    Train a word2vec model and save it.
    :return: model
    """
    sentences = word2vec.Text8Corpus("../data/corpus_seg.txt")  # load the segmented corpus
    # sg=1 trains the skip-gram model the original comment describes (the default, sg=0, is CBOW)
    model = word2vec.Word2Vec(sentences, size=200, min_count=1, window=10, sg=1)
    # NOTE: on the meaning and choice of word2vec parameters, see
    # https://github.com/lxw0109/NLPExperiments/blob/master/word2vec/doc/Learning%20Notes%20on%20word2vec.ipynb
    return model
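# Usage sketch (assumes ../data/corpus_seg.txt exists; the save path is illustrative):
model = train_w2v()
model.save('../data/w2v_200.model')
print(model.most_similar(u'apple', topn=3))  # only works for words present in the corpus vocabulary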
"""
# OK
import argparse
import codecs
import csv
import random

from gensim.models import KeyedVectors

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn (< 0.18)
    from sklearn.cross_validation import train_test_split
parser = argparse.ArgumentParser(description='Preparation.')
parser.add_argument('--w2v', default='all.norm-sz100-w10-cb0-it1-min100.w2v', nargs='?',
                    help='Path to the word2vec model.')
parser.add_argument('--seed', default=228, type=int, nargs='?', help='Random seed.')
args = vars(parser.parse_args())
RANDOM_SEED = args['seed']
random.seed(RANDOM_SEED)
# gensim >= 1.0 loads pre-trained vectors through KeyedVectors; vector_size replaces
# the old layer1_size attribute
w2v = KeyedVectors.load_word2vec_format(args['w2v'], binary=True, unicode_errors='ignore')
w2v.init_sims(replace=True)
print('Using %d word2vec dimensions from "%s".' % (w2v.vector_size, args['w2v']))
def read_subsumptions(filename):
    subsumptions = []
    with codecs.open(filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            subsumptions.append((row[0], row[1]))
    return subsumptions
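# Usage sketch ('subsumptions-train.txt' is an illustrative tab-separated file of
# hyponym/hypernym pairs; the split reuses the seeded RNG state set up above):
subsumptions = read_subsumptions('subsumptions-train.txt')
train_pairs, test_pairs = train_test_split(subsumptions, test_size=0.2, random_state=RANDOM_SEED)
print('%d train / %d test pairs' % (len(train_pairs), len(test_pairs)))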
from gensim.models import word2vec
import logging
# surface gensim's training progress (this is what the logging import is for)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus('/tmp/text8')
model = word2vec.Word2Vec(sentences, size=200)
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
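# On text8 this analogy query typically returns [('queen', ~0.6)]; the exact score
# varies between runs because training is stochastic.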
import json
from nltk.tokenize import TweetTokenizer
from gensim.models import KeyedVectors
tokenizer = TweetTokenizer()
# gensim >= 1.0 loads pre-trained binary vectors through KeyedVectors
model = KeyedVectors.load_word2vec_format('model.bin', binary=True)
with open('mscoco_train2014_annotations.json', 'r') as f:
    dataAnno = json.load(f)
with open('MultipleChoice_mscoco_train2014_questions.json', 'r') as f:
    dataQuestion = json.load(f)
feaFile = open('trainRegionFea.txt','w')
choicelist = {}
answerlist = {}
# print(dataQuestion['num_choices'])
for question in dataAnno['annotations']:
    choicelist[question['question_id']] = question['multiple_choice_answer']
    # print(choicelist[question['question_id']])
errorword = 0
erroranswer = 0
errorquestion = 0
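# Sketch of what the counters above are presumably for: tallying questions whose
# tokens are missing from the pre-trained vocabulary (the 'questions'/'question'
# keys follow the VQA v1 JSON layout; treat this as an assumption):
for question in dataQuestion['questions']:
    if any(token.lower() not in model for token in tokenizer.tokenize(question['question'])):
        errorquestion += 1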
# dummy_score_vec maps each word to a small score vector; this def line is a
# reconstruction consistent with the commented-out variant kept below
# (LineScoredWordSentence and input_file come from the surrounding test script)
def dummy_score_vec(word):
    return [len(word) * 0.1, len(word) * 0.2]
    #return [len(word)/0.2 ]

sws = list(LineScoredWordSentence(input_file, dummy_score_vec))
# print(sws[0])
import gensim
from word2veckeras import Word2VecKeras  # ScoreWord2VecKeras ships in the same package

parameters = [{'size': [5], 'hs': [0, 1], 'negative': [0, 5], 'sg': [0, 1]}]
try:
    from sklearn.model_selection import ParameterGrid
except ImportError:  # scikit-learn < 0.18 kept it in grid_search
    from sklearn.grid_search import ParameterGrid

for param in ParameterGrid(parameters):
    if param['hs'] == 0 and param['negative'] == 0:
        continue  # word2vec needs hierarchical softmax or negative sampling
    print(param)
    svk = ScoreWord2VecKeras(sws, **param)
    vsk = Word2VecKeras(gensim.models.word2vec.LineSentence(input_file), **param)
    vs = gensim.models.word2vec.Word2Vec(gensim.models.word2vec.LineSentence(input_file), **param)
    print(svk.most_similar('the', topn=5))
    print(vsk.most_similar('the', topn=5))
    print(vs.most_similar('the', topn=5))
    print(svk['the'])
    print(vsk['the'])
    print(vs['the'])
    # svk.save_word2vec_format('tmp.vec')
    # svk.save('tmp.model')
    # print(svk.score_vector_size)
scored_word_list = [
    ['This', [20 * 0.1, 10 * 0.2]],
    ['is', [10 * 0.1, 5 * 0.2]],
    ['a', [30 * 0.1, 10 * 0.2]],
    # ... (remaining entries truncated in the source)
]
from operator import itemgetter
import os.path
import codecs
import math
import numpy as np
from gensim.models import word2vec
default_count = 100  # arbitrary; should exceed min_count of the vec object (5 by default)

class SenseGram(word2vec.Word2Vec):
    def __init__(self, *args, **kwargs):
        super(SenseGram, self).__init__(*args, **kwargs)
        self.probs = {}  # mapping from a sense (string) to its probability
    def get_senses(self, word, ignore_case=False):
        """Return a list of all available senses for a given word.
        example: 'mouse' -> ['mouse#0', 'mouse#1', 'mouse#2']
        Assumption: senses use continuous numbering."""
        words = [word]
        senses = []
        if ignore_case:
            words.append(word[0].upper() + word[1:])
            words.append(word[0].lower() + word[1:])
        words = set(words)
        for word in words:
            # the loop body was cut off in the source; given the docstring's
            # continuous-numbering assumption, walking word#0, word#1, ... until a
            # sense is missing is the natural completion (gensim < 4.0 vocab layout)
            i = 0
            while u'%s#%d' % (word, i) in self.wv.vocab:
                senses.append(u'%s#%d' % (word, i))
                i += 1
        return senses
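# Usage sketch ('senses.model' is an illustrative path to a previously saved SenseGram):
sg = SenseGram.load('senses.model')
print(sg.get_senses('mouse'))  # e.g. ['mouse#0', 'mouse#1', 'mouse#2']
print(sg.get_senses('Mouse', ignore_case=True))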
@classmethod  # implied by the cls parameter
def get_predict_vecs(cls, words):
    n_dim = 300
    # load the pre-trained word2vec model saved by the training step
    imdb_w2v = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    # imdb_w2v.train(words)
    train_vecs = cls.build_wordvector(words, n_dim, imdb_w2v)
    # print(train_vecs.shape)
    return train_vecs
def save(self, *args, **kwargs):
    # don't bother storing the cached normalized vectors; the table is recalculable
    # TODO: since the introduction of KeyedVectors, syn0, vocab and id2word are saved
    # twice (once in word2vec, once in keyedvectors); after keyedvectors are
    # deprecated they will be saved only once
    Word2Vec.disable_keyed_vectors_warnings()
    kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'table', 'cum_table'])
    super(Word2Vec, self).save(*args, **kwargs)
    Word2Vec.enable_keyed_vectors_warnings()
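# Usage sketch (assumes `model` is a trained instance of the class above; the path is
# illustrative): the 'ignore' default above drops recomputable arrays on save, so an
# explicit empty list forces a full save.
model.save('w2v_full.model', ignore=[])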