import logging
import gensim

def make_index():
    logging.info('loading dictionary')
    dictionary = gensim.corpora.Dictionary.load_from_text('svd/dictionary.txt')
    logging.info('loading corpus')
    corpus = gensim.corpora.MmCorpus('svd/corpus.mm')
    tfidf = gensim.models.TfidfModel(corpus)
    logging.info('loading model')
    model = gensim.models.ldamodel.LdaModel.load('svd/lda.txt')
    logging.info('building lda docs')
    lda_corpus = model[tfidf[corpus]]
    logging.info('building index')
    index = gensim.similarities.docsim.Similarity('/tmp/lda_index.txt', lda_corpus, num_features=1000)
    index.save('svd/lda_index.txt')
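A minimal sketch of how the saved index might be queried later, assuming the same dictionary, tfidf and model objects as in make_index (the query text is hypothetical):

index = gensim.similarities.docsim.Similarity.load('svd/lda_index.txt')
query_bow = dictionary.doc2bow("some query text".lower().split())  # hypothetical query
sims = index[model[tfidf[query_bow]]]  # cosine similarity against every indexed document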
import pyLDAvis
import pyLDAvis.gensim as gensim_vis  # renamed to pyLDAvis.gensim_models in pyLDAvis >= 3.x
from gensim import models
from gensim.corpora import Dictionary, MmCorpus

def build_pyLDAvis_output(corp_loc, dict_loc, lda_loc):
    if '.model' not in lda_loc:
        lda_loc += '.model'
    corpus = MmCorpus(corp_loc)
    dictionary = Dictionary.load(dict_loc)
    lda = models.LdaModel.load(lda_loc)
    vis_data = gensim_vis.prepare(lda, corpus, dictionary, sort_topics=False)
    pyLDAvis.save_html(vis_data, lda_loc.split('.model')[0] + '.html')
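Usage would presumably be a single call per trained model; the paths below are hypothetical:

build_pyLDAvis_output('svd/corpus.mm', 'svd/dictionary.dict', 'svd/lda')  # writes svd/lda.html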
def create_documents_view(self, corpus, ir_mode):
    dictionary, pdocs = self.create_dictionary(corpus)
    bow = self.docs2bows(corpus, dictionary, pdocs)
    loaded_corpus = corpora.MmCorpus('vsm_docs.mm')  # recover the serialized corpus
    if ir_mode == 1:
        model = [[(w[0], 1 + np.log2(w[1])) for w in v] for v in bow]  # TF model (log-scaled term frequency)
    elif ir_mode == 2:
        model = models.TfidfModel(loaded_corpus)  # TF-IDF model
    elif ir_mode == 3:
        model = models.LdaModel(loaded_corpus)  # LDA model
    elif ir_mode == 4:
        model = models.LdaMulticore(loaded_corpus)  # LDA multicore model
    elif ir_mode == 5:
        model = models.LsiModel(loaded_corpus)  # LSI model
    elif ir_mode == 6:
        model = models.RpModel(loaded_corpus)  # random projections model
    elif ir_mode == 7:
        model = models.LogEntropyModel(loaded_corpus)  # log-entropy model
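Note that ir_mode == 1 yields an already-transformed list of vectors, while the other branches return gensim transformation objects that still have to be applied to the corpus; a caller would presumably normalize the two cases, e.g.:

vectors = model if ir_mode == 1 else model[loaded_corpus]  # apply the trained transformation lazily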
import logging
import os
import tempfile

# module_path, corpusname and WikiExternParsingCorpus are defined elsewhere
# in the source script.
source = os.path.join(module_path, corpusname + '.bz2')
# save the results to the temp directory
output = os.path.join(tempfile.gettempdir(), corpusname)
# build the dictionary
logging.info("source: %s", source)
wiki = WikiExternParsingCorpus(source, keep_words=200000)
# save dictionary and bag-of-words
wiki.saveAsText(output)
del wiki
# initialize corpus reader and word->id mapping
from gensim.corpora import MmCorpus
id2token = WikiExternParsingCorpus.loadDictionary(output + '_wordids.txt')
mm = MmCorpus(output + '_bow.mm')
# build tf-idf
from gensim.models import TfidfModel
tfidf = TfidfModel(mm, id2word=id2token, normalize=True)
# save tf-idf vectors in Matrix Market format
MmCorpus.saveCorpus(output + '_tfidf.mm', tfidf[mm], progressCnt=10000)
logging.info("finished running")
#!/usr/bin/env python
import logging, gensim, bz2, sys

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

if len(sys.argv) != 2:
    raise Exception("Usage: %s <num_topics>" % sys.argv[0])
ntop = int(sys.argv[1])
print("Using " + str(ntop) + " topics")
# load id->word mapping (the dictionary), one of the results of step 2 above
id2word = gensim.corpora.Dictionary.load_from_text('sparse_wiki_wordids.txt')
# load corpus iterator
mm = gensim.corpora.MmCorpus(bz2.BZ2File('sparse_wiki_tfidf.mm.bz2'))
print(mm)
lsi = gensim.models.lsimodel.LsiModel(corpus=mm, id2word=id2word, num_topics=ntop, chunksize=10000)
lsi_file = 'LSI/lsi' + str(ntop)
lsi.save(lsi_file)
out_vect = 'LSI/wikipedia_lsi' + str(ntop) + '.txt'
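out_vect is assigned but never used in this excerpt; presumably the LSI vectors are serialized to it, mirroring the LDA script further down (a sketch):

gensim.corpora.MmCorpus.serialize(out_vect, lsi[mm])  # save the projected corpus in Matrix Market format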
from gensim import corpora, models, similarities

# define the data directory
path_data = "/Volumes/CWFDATA/semantics"

# load the data from whitaker.models.py
print("loading saved dictionary")
dictionary = corpora.Dictionary.load(path_data + "/whitaker.dict")
print("loading saved corpus")
corpus = corpora.MmCorpus(path_data + "/whitaker.mm")
print("loading saved tfidf model")
tfidf = models.TfidfModel.load(path_data + "/model.whitaker.tfidf")
print("creating tfidf wrapper for corpus")
corpus_tfidf = tfidf[corpus]
print("loading saved lsi model")
lsi = models.LsiModel.load(path_data + "/model.whitaker.lsi")
print("creating lsi wrapper for corpus")
corpus_lsi = lsi[corpus_tfidf]
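A typical next step, not shown in the excerpt, is to build a similarity index over the LSI vectors (a sketch; this keeps the whole index in RAM):

index = similarities.MatrixSimilarity(corpus_lsi, num_features=lsi.num_topics)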
from __future__ import absolute_import
import itertools
import logging
import gensim
import numpy as np
import pandas as pd
from .topic_space.dictionaries import iter_corpus
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
corpus_simple_mm = gensim.corpora.MmCorpus('material_science/output/simple.mm')
corpus_simple_dict = gensim.corpora.Dictionary.load('material_science/output/simple.dict')
print(corpus_simple_mm)
# Possible models:
# - LDA
# - LSI
# - TF-IDF
lda_model = gensim.models.LdaModel(corpus_simple_mm, num_topics=10, id2word=corpus_simple_dict, passes=4)
# Transforming
# transform text into the bag-of-words space
#bow_vector = id2word_wiki.doc2bow(tokenize(text))
#print([(id2word_wiki[id], count) for id, count in bow_vector])
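A working version of that commented-out transformation, using the objects defined in this snippet (the document text is made up):

doc = "hypothetical materials science abstract"
bow_vector = corpus_simple_dict.doc2bow(doc.lower().split())
print([(corpus_simple_dict[wid], count) for wid, count in bow_vector])
print(lda_model[bow_vector])  # topic distribution for the new document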
import logging
import os
import gensim

MODELS_DIR = "models"
NUM_TOPICS = 4

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

dictionary = gensim.corpora.Dictionary.load(os.path.join(MODELS_DIR, "bok.dict"))
corpus = gensim.corpora.MmCorpus(os.path.join(MODELS_DIR, "bok.mm"))

# Project to LDA space
lda = gensim.models.LdaModel(corpus, id2word=dictionary,
                             iterations=300,
                             num_topics=NUM_TOPICS)

ftt = open(os.path.join(MODELS_DIR, "topic_terms.csv"), 'w')  # text mode: we write str, not bytes
for topic_id in range(NUM_TOPICS):
    term_probs = lda.show_topic(topic_id, topn=50)
    for term, prob in term_probs:  # show_topic yields (term, probability) pairs
        ftt.write("%d\t%s\t%.3f\n" % (topic_id, term.replace("_", " "), prob))
ftt.close()

fdt = open(os.path.join(MODELS_DIR, "doc_topics.csv"), 'w')
for doc_id in range(len(corpus)):
    docbok = corpus[doc_id]
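    # Sketch of the presumably elided remainder: write each document's
    # topic distribution, mirroring the topic_terms loop above.
    for topic_id, prob in lda[docbok]:
        fdt.write("%d\t%d\t%.3f\n" % (doc_id, topic_id, prob))
fdt.close()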
#!/usr/bin/env python
import logging, gensim, bz2

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# load id->word mapping (the dictionary), one of the results of step 2 above
id2word = gensim.corpora.Dictionary.load_from_text('sparse_wiki_wordids.txt')
# load corpus iterator
mm = gensim.corpora.MmCorpus(bz2.BZ2File('sparse_wiki_tfidf.mm.bz2'))
print(mm)
ntop = 128
lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=ntop, update_every=0, passes=20)
lda_file = 'LDA/lda' + str(ntop)
lda.save(lda_file)
out_vect = 'LDA/wikipedia_lda' + str(ntop) + '.txt'
gensim.corpora.MmCorpus.serialize(out_vect, (gensim.matutils.unitvec(vec) for vec in lda[mm]))
from os.path import exists, join

# corpus, dictionary, args and model_dir are defined earlier in the source script.
if not exists(mm_corpus_file):
    print("corpus not found. Starting to build it...")

    class CorpusWrapper:
        def __init__(self, dictionary):
            self._dictionary = dictionary

        def __iter__(self):
            for tokens in corpus:
                yield self._dictionary.doc2bow(tokens)

    gensim.corpora.MmCorpus.serialize(mm_corpus_file, CorpusWrapper(dictionary))

mm_corpus = gensim.corpora.MmCorpus(mm_corpus_file)

# generate LDA model
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
ldamodel = gensim.models.LdaMulticore(mm_corpus,
                                      id2word=dictionary,
                                      alpha='asymmetric', eta='auto',
                                      num_topics=args.num_topics,
                                      passes=args.passes,
                                      eval_every=args.eval_every,
                                      batch=True,
                                      chunksize=args.chunksize,
                                      iterations=args.iterations)
print("Saving LDA model...")
ldamodel.save(join(model_dir, 'LDA.model'))
print("Saving words for topics...")