# Round-trip test: train, save, reload, then query through vec4ir's
# Doc2Vec-based retrieval pipeline. `documents`, `TEST_FILE`, and
# `DEFAULT_ANALYZER` are module-level test fixtures defined elsewhere.
import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from vec4ir import Doc2VecInference, Matching, Retrieval  # import paths assumed

def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
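# A minimal plain-gensim sketch of the same query flow, without the vec4ir
# wrappers used above (Doc2VecInference/Retrieval). Assumes gensim >= 4.0,
# where trained document vectors live in model.dv; the corpus is illustrative.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

corpus = ["scientists discovered a new species",
          "the stock market closed higher today"]
tagged = [TaggedDocument(simple_preprocess(d), [i]) for i, d in enumerate(corpus)]
m = Doc2Vec(tagged, epochs=20, min_count=1, vector_size=10)

query_vec = m.infer_vector(simple_preprocess("scientists"))
print(m.dv.most_similar([query_vec], topn=2))  # [(tag, cosine similarity), ...]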
# Fragment of a tweet-sentiment trainer (Python 2, pre-1.0 gensim API:
# LabeledSentence predates TaggedDocument, and `size` predates
# `vector_size`). `preprocess_tweet`, `labeled_sent`, the counters, and
# `epoch_num` are defined elsewhere in the script.
# Doc2Vec takes its input in the LabeledSentence data structure,
# e.g. LabeledSentence(['list', 'of', 'tokenized', 'words'], ['pos_0']).
if label == 'pos':  # condition assumed; the if-line is missing from the source
    ls = LabeledSentence(preprocess_tweet(sentence).split(), [label + '_%d' % pos_count])
    pos_count += 1
else:
    ls = LabeledSentence(preprocess_tweet(sentence).split(), [label + '_%d' % neg_count])
    neg_count += 1
labeled_sent.append(ls)

logging.info("Training on %d Positive and %d Negative tweets", pos_count, neg_count)
logging.info("Building model...")
# Setting min_count > 1 can make some tweets "disappear" from the Doc2Vec
# corpus later: a tweet made up only of low-frequency words is left with
# no in-vocabulary tokens.
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5,
                workers=7)
logging.info("Building Vocabulary...")
model.build_vocab(labeled_sent)
logging.info("Training model...")
for epoch in xrange(epoch_num):
    logging.info("Epoch %s...", epoch)
    # Temporarily raise the logging threshold to WARNING so that
    # model.train does not flood the log.
    logging.getLogger().setLevel(logging.WARN)
    # np.random.permutation returns a shuffled copy (it does not shuffle
    # in place); reshuffling between epochs improves model accuracy.
    model.train(np.random.permutation(labeled_sent))
    logging.getLogger().setLevel(logging.INFO)
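# A quick illustration of the min_count caveat noted above (assumes the
# gensim >= 4 API; names here are illustrative): with min_count=2, a
# "tweet" containing only words that occur once contributes no tokens to
# the vocabulary, so its document effectively disappears from training.
from gensim.models.doc2vec import Doc2Vec as D2V, TaggedDocument as TD

toy_docs = [TD(['good', 'good', 'day'], ['pos_0']),
            TD(['ephemeral', 'hapax'], ['neg_0'])]  # all singleton words
toy_model = D2V(min_count=2, vector_size=10)
toy_model.build_vocab(toy_docs)
print('good' in toy_model.wv.key_to_index)       # True: occurs twice
print('ephemeral' in toy_model.wv.key_to_index)  # False: pruned by min_count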
# Build TaggedDocuments from a one-document-per-line corpus file, then
# train and save a Doc2Vec model. `g` is an alias for gensim.models;
# `tokenizer`, `labels`, `nlp_clean`, and the hyperparameter variables are
# defined elsewhere. Note the old-style parameter names (`size`, `iter`).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
taggeddocs = []
with open(train_corpus, 'r') as f:
    docs = f.read().splitlines()
if tokenize:
    docs = nlp_clean(docs)
for label, doc in zip(labels, docs):
    td = TaggedDocument(words=tokenizer.tokenizer.split(doc.lower(), delimiter=' '),
                        tags=[label])
    taggeddocs.append(td)
# `pretrained_emb` is not a mainline gensim option; it targets a patched
# gensim fork that supports pre-trained word embeddings.
model = g.Doc2Vec(taggeddocs, size=vector_size, window=window_size,
                  min_count=min_count, sample=sampling_threshold,
                  workers=worker_count, hs=0, dm=dm, negative=negative_size,
                  dbow_words=1, dm_concat=1, pretrained_emb=None,
                  iter=train_epoch)
# save model
model.save(saved_path)
def load_doc2vec_model(texts, model_path):
    if os.path.exists(model_path):
        # A saved Doc2Vec model must be restored with Doc2Vec.load, not
        # Word2Vec.load as in the original.
        model = Doc2Vec.load(model_path)
    else:
        documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(texts)]
        # The original passed both the old `size` and the new `vector_size`
        # arguments; current gensim uses vector_size and epochs only.
        model = Doc2Vec(documents, vector_size=100, window=3, min_count=1,
                        workers=4, alpha=0.025, epochs=40)
        # The constructor above already trains once; this call runs a
        # further training pass, as in the original.
        model.train(documents, total_examples=model.corpus_count,
                    epochs=model.epochs)
    # model = Word2Vec(sg=1, sentences=texts, size=256, window=5, min_count=1, iter=40)
    # model.save(model_path)
    return model
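# Hypothetical usage of load_doc2vec_model; the corpus and the model path
# are placeholders.
texts = [['machine', 'learning', 'is', 'fun'],
         ['doc2vec', 'embeds', 'whole', 'documents']]
model = load_doc2vec_model(texts, 'models/example.d2v')
print(model.infer_vector(['machine', 'learning']))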
def train(self, size=100, window=8, min_count=5, workers=4):
    """Train Gensim Doc2Vec model.

    Parameters
    ----------
    size : int, optional
        Dimensionality of the document vectors.
    window : int, optional
        Maximum distance between the current and predicted word.
    min_count : int, optional
        Ignore words with a total frequency below this threshold.
    workers : int, optional
        Number of worker threads used for training.
    """
    tagged_documents = self.iterable_tagged_documents()
    self.model = gensim.models.Doc2Vec(
        tagged_documents, size=size, window=window, min_count=min_count,
        workers=workers)
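# A direct, unwrapped equivalent of the call above under the gensim >= 4
# API, where `size` was renamed `vector_size`. The two-document corpus is
# illustrative only (min_count is lowered so the toy vocabulary survives).
import gensim
from gensim.models.doc2vec import TaggedDocument

toy_tagged = [TaggedDocument(['sample', 'tokens'], [0]),
              TaggedDocument(['more', 'sample', 'text'], [1])]
toy = gensim.models.Doc2Vec(toy_tagged, vector_size=100, window=8,
                            min_count=1, workers=4)
print(toy.dv[0])  # trained vector for tag 0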
import json
import gensim.models
from difflib import SequenceMatcher

filename = 'data/acronyms_best.json'
with open(filename, 'r') as f:
    data = json.load(f)

model = gensim.models.Doc2Vec.load('models_context/IC.model')
# print(model.docvecs.similarity("Entity-Relationship Model", "entity-relationship models"))
print(model.docvecs.most_similar("Integrated Circuit", topn=20))

total = 0
correct = 0
error = 0

def similar(a, b):
    """Case-insensitive string similarity ratio in [0, 1]."""
    a = a.lower()
    b = b.lower()
    return SequenceMatcher(None, a, b).ratio()

n = 0
wrong = 0
for k, v in data.items():
    try:
        model = gensim.models.Doc2Vec.load('models_context/' + k + '.model')
        if similar(v["full_form"], model.docvecs.most_similar(v["full_form"])[0][0]) > 0.80:
            # Assumed continuation: the source snippet is cut off after the
            # if-line; updating the counters defined above is the obvious intent.
            correct += 1
        else:
            wrong += 1
    except Exception:
        error += 1
    total += 1
def train_doc2vec(self, docs, docs_unsuperv, model=None):
    # Method name and signature assumed: the source snippet begins at the
    # tail of its docstring. Uses the pre-1.0 gensim API (`size`, bare
    # `train()` calls); `log`, `random`, `multiprocessing`, `datetime`,
    # and the helper methods are defined elsewhere.
    """
    Returns
    -------
    model : gensim.Doc2Vec
        Trained model.
    """
    times_epoch = []
    start = datetime.now()
    docs_all = list(docs) + list(docs_unsuperv)
    docs_all = self.labelize_tokenize_docs(docs_all, self.w2v_label)
    docs = self.labelize_tokenize_docs(docs, self.w2v_label)
    if model is None:
        cores = multiprocessing.cpu_count()
        model = gensim.models.Doc2Vec(min_count=3, window=10, size=100,
                                      sample=1e-3, negative=5,
                                      workers=cores)
    model.build_vocab(docs_all)
    docs_perm = docs_all
    for epoch in range(10):
        log.info('Doc-2-Vec epoch: {}'.format(epoch))
        start_epoch = datetime.now()
        random.shuffle(docs_perm)  # shuffle in place between epochs
        model.train(docs_perm)
        times_epoch.append((start_epoch, datetime.now()))
    self.results['d2v-training-times'] = {'start': start,
                                          'stop': datetime.now(),
                                          'epochs': times_epoch}
    r = self.get_doc_2_vec_vectors(model, docs)
    return r, model
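# With the current gensim API the manual epoch loop above is unnecessary:
# train() runs the epochs internally and requires total_examples and
# epochs explicitly. A minimal self-contained sketch (toy corpus, and
# min_count lowered so its vocabulary is not pruned away):
import multiprocessing
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

corpus = [TaggedDocument(['some', 'tokens'], ['SENT_0']),
          TaggedDocument(['more', 'tokens'], ['SENT_1'])]
d2v = Doc2Vec(min_count=1, window=10, vector_size=100, sample=1e-3,
              negative=5, workers=multiprocessing.cpu_count())
d2v.build_vocab(corpus)
d2v.train(corpus, total_examples=d2v.corpus_count, epochs=10)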
#!/usr/bin/env python
import pandas as pd
import numpy as np
import csv
import gensim, os

doc2vec = gensim.models.Doc2Vec.load('./models/user_stylometric.model')
data = np.asarray(pd.read_csv('./train_balanced_user.csv', header=None))
DIM = 300
directory = "./user_embeddings"
if not os.path.exists(directory):
    os.makedirs(directory)
out_file = open(directory + "/user_stylometric.csv", 'w')  # renamed: `file` shadows a builtin
wr = csv.writer(out_file, quoting=csv.QUOTE_ALL)
# Infer a paragraphVec vector for each user; infer_vector needs a list of
# tokens rather than a raw string, hence the split().
vectors = np.asarray([doc2vec.infer_vector(data[i][1].split())
                      for i in range(data.shape[0])])
users = data[:, 0]
for i in range(len(users)):
    ls = []
    ls.append(users[i])
    # Assumed continuation (the source snippet is cut off here): append the
    # inferred vector and write one CSV row per user.
    ls.extend(vectors[i])
    wr.writerow(ls)
out_file.close()
def train_and_save_doc2vec(docs, output_file, options=None):
    print("Training model...")
    # `options or {}` avoids the mutable default argument in the original.
    model = Doc2Vec(docs, **(options or {}))
    model.save(output_file)
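# Hypothetical usage; the corpus, file name, and hyperparameters are
# placeholders.
example_docs = [TaggedDocument(['an', 'example', 'document'], [0]),
                TaggedDocument(['another', 'short', 'document'], [1])]
train_and_save_doc2vec(example_docs, 'example_doc2vec.model',
                       options={'vector_size': 50, 'min_count': 1, 'epochs': 5})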