def similarity_3_contexts(p, t):
    (bef, bet, aft) = (0, 0, 0)
    if t.bef_vector is not None and p.bef_vector is not None:
        bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector))
    if t.bet_vector is not None and p.bet_vector is not None:
        bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector))
    if t.aft_vector is not None and p.aft_vector is not None:
        aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector))
    return 0*bef + 1*bet + 0*aft
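This first variant hard-codes the context weights to 0/1/0; the variants below expose them as alpha/beta/gamma. All of them rest on the same identity: the dot product of two matutils.unitvec-normalised vectors is their cosine similarity. A minimal standalone check of that identity (the vectors are invented for illustration):

import numpy as np
from gensim import matutils

a = np.array([1.0, 2.0, 0.0])
b = np.array([2.0, 1.0, 2.0])

# dot product of unit-length vectors == cosine similarity
cos = np.dot(matutils.unitvec(a), matutils.unitvec(b))
reference = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
assert abs(cos - reference) < 1e-12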
def similarity_3_contexts(self, p, t):
    (bef, bet, aft) = (0, 0, 0)
    if t.bef_vector is not None and p.bef_vector is not None:
        bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector))
    if t.bet_vector is not None and p.bet_vector is not None:
        bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector))
    if t.aft_vector is not None and p.aft_vector is not None:
        aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector))
    return self.config.alpha*bef + self.config.beta*bet + self.config.gamma*aft
def similarity_3_contexts(self, t, p):
    (bef, bet, aft) = (0, 0, 0)
    if t.bef_vector is not None and p.bef_vector is not None:
        bef = dot(
            matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector)
        )
    if t.bet_vector is not None and p.bet_vector is not None:
        bet = dot(
            matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector)
        )
    if t.aft_vector is not None and p.aft_vector is not None:
        aft = dot(
            matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector)
        )
    return self.config.alpha*bef + \
        self.config.beta*bet + \
        self.config.gamma*aft
else:
    tfidf = gensim.models.TfidfModel(corpus)
    tfidf.save(tfidf_file)

# build/load LSI model, on top of the TF-IDF model
lsi_file = outfile('lsi.model')
if os.path.exists(lsi_file):
    lsi = gensim.models.LsiModel.load(lsi_file)
else:
    lsi = gensim.models.LsiModel(
        tfidf[corpus], id2word=corpus.dictionary,
        num_topics=NUM_TOPICS, chunksize=10000,
    )
    lsi.save(lsi_file)

# convert all articles to latent semantic space, store the result as a MatrixMarket file
# normalize all vectors to unit length, to simulate cossim in libraries that only support euclidean distance
vectors_file = os.path.join(outdir, 'lsi_vectors.mm')
gensim.corpora.MmCorpus.serialize(vectors_file, (gensim.matutils.unitvec(vec) for vec in lsi[tfidf[corpus]]))

logger.info("finished running %s" % program)
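The comment about simulating cosine similarity holds because, once every vector has unit length, squared Euclidean distance is a monotone function of the cosine: ||a - b||^2 = 2 - 2*cos(a, b), so nearest-neighbour search by Euclidean distance ranks documents exactly as cosine similarity would. A small self-contained check (vectors invented for illustration):

import numpy as np
from gensim import matutils

a = matutils.unitvec(np.array([0.3, 1.2, 0.5]))
b = matutils.unitvec(np.array([1.0, 0.1, 0.7]))

cos = np.dot(a, b)
dist_sq = np.sum((a - b) ** 2)
assert abs(dist_sq - (2 - 2 * cos)) < 1e-12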
Returns
-------
{`scipy.sparse.csr.csr_matrix`, list of (int, float)}
    Similarities between `query` (a single document or a corpus) and the indexed documents; the return type depends on `query`.

"""
is_corpus, query = utils.is_corpus(query)
if self.normalize:
    # self.normalize only works if the input is a plain gensim vector/corpus (as
    # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
    # as well, but in that case assume tricks are happening and don't normalize
    # anything (self.normalize has no effect).
    if not matutils.ismatrix(query):
        if is_corpus:
            query = [matutils.unitvec(v) for v in query]
        else:
            query = matutils.unitvec(query)
result = self.get_similarities(query)

if self.num_best is None:
    return result

# if maintain_sparsity is True, result is scipy sparse. Sort, clip the
# topn and return as a scipy sparse matrix.
if getattr(self, 'maintain_sparsity', False):
    return matutils.scipy2scipy_clipped(result, self.num_best)

# if the input query was a corpus (=more documents), compute the top-n
# most similar for each document in turn
if matutils.ismatrix(result):
    return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
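The fragment above is the __getitem__ body that runs whenever a gensim similarity index is queried as index[query]: the query is unit-normalised (if self.normalize is set), similarities are computed, and the result is optionally clipped to the top num_best hits. A hypothetical usage sketch; the toy texts are made up, only the gensim calls themselves are real:

from gensim import corpora, similarities

texts = [["human", "computer", "interface"],
         ["graph", "minors", "trees"],
         ["human", "system", "computer"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

index = similarities.MatrixSimilarity(corpus, num_features=len(dictionary))
index.num_best = 2  # return only the two best matches as (doc_id, cosine) pairs

query = dictionary.doc2bow(["human", "computer"])
print(index[query])  # e.g. [(0, 0.81...), (2, 0.81...)]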
def mean_rep(v1, v2):
    return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))
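mean_rep compares two bags of word vectors by averaging each bag into a centroid, unit-normalising both centroids, and taking their dot product, i.e. the cosine of the two centroids. A hypothetical, self-contained usage sketch with made-up 4-dimensional "word vectors":

from numpy import dot, array
from gensim import matutils

def mean_rep(v1, v2):
    return dot(matutils.unitvec(array(v1).mean(axis=0)),
               matutils.unitvec(array(v2).mean(axis=0)))

phrase_a = [[0.1, 0.9, 0.0, 0.2], [0.3, 0.7, 0.1, 0.0]]
phrase_b = [[0.2, 0.8, 0.1, 0.1]]
print(mean_rep(phrase_a, phrase_b))  # cosine similarity of the two centroids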
def similarity_3_contexts(self, p, t):
    (bef, bet, aft) = (0, 0, 0)
    if t.bef_vector is not None and p.bef_vector is not None:
        bef = dot(
            matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector)
        )
    if t.bet_vector is not None and p.bet_vector is not None:
        bet = dot(
            matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector)
        )
    if t.aft_vector is not None and p.aft_vector is not None:
        aft = dot(
            matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector)
        )
    return self.config.alpha*bef + self.config.beta*bet + self.config.gamma*aft
>>> index.add_documents(one_more_corpus)  # add more documents in corpus

"""
min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
    # the last shard was incomplete (< min_ratio * shardsize); load it back and
    # add the documents there, don't start a new shard
    self.reopen_shard()
for doc in corpus:
    if isinstance(doc, numpy.ndarray):
        doclen = len(doc)
        doc = matutils.unitvec(doc, self.norm)
    elif scipy.sparse.issparse(doc):
        doclen = doc.nnz
        doc = matutils.unitvec(doc, self.norm)
    else:
        doclen = len(doc)
        if doclen < 0.3 * self.num_features:
            doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
        else:
            doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
    self.fresh_docs.append(doc)
    self.fresh_nnz += doclen
    if len(self.fresh_docs) >= self.shardsize:
        self.close_shard()
    if len(self.fresh_docs) % 10000 == 0:
        logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))