    text_object_level_split=config["text_object_level_split"],
    phrase_model=phrase_model,
)
target_corpus: CorpusVectorizer = CorpusVectorizer(
    target_texts,
    text_object_definition=config["text_object_definition"],
    dictionary=source_corpus.dictionary,
    min_text_obj_length=config["min_text_obj_length"],
    n_chunk=config["n_chunk"],
    text_object_level_split=config["text_object_level_split"],
    phrase_model=phrase_model,
)
# keep both corpora on the same dictionary after the target pass
source_corpus.dictionary = target_corpus.dictionary
print("Vectorizing texts...", flush=True)
model: TfidfModel = TfidfModel(chain(source_corpus.vectors, target_corpus.vectors), smartirs="atc")
source_corpus.update_with_tfidf(model)
target_corpus.update_with_tfidf(model)
index: SparseMatrixSimilarity = SparseMatrixSimilarity(
    source_corpus, num_features=len(source_corpus.dictionary), num_docs=len(source_corpus)
)
results: np.ndarray = index[target_corpus]  # shape: (n_target, n_source)
with tqdm(total=source_corpus.length, leave=False) as pbar:
    for source_pos, source_vector_results in enumerate(results.T):
        filtered_results: np.ndarray = np.where(source_vector_results > config["min_similarity"])[0]
        count += len(filtered_results)
        for target_pos in filtered_results:
            matches.append(
                (
                    PASSAGE_GROUP(
                        source_corpus[source_pos],
                        source_corpus.metadata[source_pos]["start_byte"],
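
# A minimal, self-contained sketch of the same pattern as the (truncated)
# snippet above: fit a SMART "atc" TF-IDF model over two corpora chained
# together, index one side with SparseMatrixSimilarity, then query it with
# the other side. The toy texts are placeholders, not data from the project.
from itertools import chain

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity

source_texts = [["roman", "senate", "law"], ["consul", "senate", "vote"]]
target_texts = [["senate", "law", "vote"], ["roman", "consul", "law"]]
dictionary = Dictionary(source_texts + target_texts)
source_bow = [dictionary.doc2bow(text) for text in source_texts]
target_bow = [dictionary.doc2bow(text) for text in target_texts]
model = TfidfModel(chain(source_bow, target_bow), smartirs="atc")
index = SparseMatrixSimilarity(
    model[source_bow], num_features=len(dictionary), num_docs=len(source_bow)
)
results = index[model[target_bow]]  # shape: (n_target, n_source)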
# pickle files must be opened in binary mode
with open(path_data + "lookup.pickle", "rb") as f:
    lookup = pickle.load(f)
# load saved dictionary
dictionary = corpora.Dictionary.load(path_data + "whitaker.dict")
# load saved corpus
corpus = corpora.MmCorpus(path_data + "whitaker.mm")
# load saved tfidf model
tfidf = models.TfidfModel.load(path_data + "model.whitaker.tfidf")
# create tfidf wrapper for corpus
corpus_tfidf = tfidf[corpus]
# load saved lsi model
lsi = models.LsiModel.load(path_data + "model.whitaker.lsi")
# create lsi wrapper for corpus
corpus_lsi = lsi[corpus_tfidf]
# load similarities matrix
index = similarities.MatrixSimilarity.load(path_data + "whitaker.lsi.index")
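
# Hedged usage sketch for the objects loaded above: push a new document
# through the same tfidf -> lsi pipeline and rank it against the index.
# The tokenization is a placeholder, and `lookup` is assumed to map corpus
# positions back to document identifiers.
doc = "quis custodiet ipsos custodes"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[tfidf[vec_bow]]
sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])
for pos, score in sims[:10]:
    print(lookup[pos], score)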
def __init__(self, doc_reader):
    '''
    Set TfIdfList attributes.
    '''
    self.doc_reader = doc_reader
    print('Generating Gensim TF-IDF model ...')
    tfidf = gensim.models.TfidfModel(self.doc_reader.corpus)
    self.scores = tfidf[self.doc_reader.corpus]
wiki.dictionary.filter_tokens(bad_ids=stop_ids)
# save dictionary and bag-of-words (term-document frequency matrix)
MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
# load back the id->word mapping directly from file
# this seems to save more memory, compared to keeping the wiki.dictionary object from above
dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
del wiki
# initialize corpus reader and word->id mapping
mm = MmCorpus(outp + '_bow.mm')
# build tfidf
tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
tfidf.save(outp + '.tfidf_model')
# save tfidf vectors in matrix market format
MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
logger.info("finished running %s", program)
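
# A later session can reload the artifacts saved above without rebuilding
# anything; a sketch, with file names following the same `outp` convention:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel

dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
tfidf = TfidfModel.load(outp + '.tfidf_model')
corpus_tfidf = MmCorpus(outp + '_tfidf.mm')  # precomputed tfidf vectors
vec = tfidf[dictionary.doc2bow(['anarchism', 'monarchy'])]  # transform a new doc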
def train_lda_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text)
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf,
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics)
    return lda
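
# Hypothetical usage of the helper above; normalize_corpus is assumed to
# come from the surrounding module, and the toy documents are placeholders.
toy_corpus = [
    "The fox jumps over the dog",
    "The dog sleeps while the fox runs",
]
lda = train_lda_model_gensim(toy_corpus, total_topics=2)
for topic_id, topic in lda.print_topics(num_topics=2, num_words=5):
    print(topic_id, topic)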
def transform_vec(corpus, query_corpus):
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    query_tfidf = tfidf[query_corpus]
    return corpus_tfidf, query_tfidf
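
# One way to consume transform_vec (a sketch; `corpus` and `query_corpus`
# are assumed to be bag-of-words lists built elsewhere): rank every query
# document against the reference corpus.
corpus_tfidf, query_tfidf = transform_vec(corpus, query_corpus)
index = gensim.similarities.MatrixSimilarity(corpus_tfidf)
for query_id, sims in enumerate(index[query_tfidf]):
    print(query_id, sims.argmax(), sims.max())  # best-matching corpus doc per query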
# convert each sample to a bag of words
print("converting each doc to bag-of-words")
corpus = [dictionary.doc2bow(doc) for doc in corpus]
corpora.MmCorpus.serialize(fs_data + '/whitaker.mm', corpus)
#
# this bit is copied from
# http://radimrehurek.com/gensim/tut2.html
#
print("creating tfidf model")
tfidf = models.TfidfModel(corpus)
tfidf.save(fs_data + "/model.whitaker.tfidf")
print("transforming the corpus to tfidf")
corpus_tfidf = tfidf[corpus]
print("creating lsi model")
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
lsi.save(fs_data + "/model.whitaker.lsi")
print("transforming the corpus to lsi")
corpus_lsi = lsi[corpus_tfidf]
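
# The loading code earlier on this page expects a saved similarity index
# ("whitaker.lsi.index"); a sketch of the matching build-and-save step:
print("creating similarity index")
index = similarities.MatrixSimilarity(corpus_lsi)
index.save(fs_data + "/whitaker.lsi.index")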
def _process_corpus(self):
    # Get MatrixMarket format corpus
    print("\tConverting corpus")
    gensim.corpora.MmCorpus.serialize(
        self.fname + '.mm', self.corpus, progress_cnt=100
    )
    mm_corpus = gensim.corpora.MmCorpus(self.fname + '.mm')
    # Get TF-IDF model
    print("\tComputing TF-IDF")
    tfidf = gensim.models.TfidfModel(
        mm_corpus, id2word=self.dictionary, normalize=True
    )
    gensim.corpora.MmCorpus.serialize(
        self.fname + '_tfidf.mm', tfidf[mm_corpus], progress_cnt=100
    )
    # Reload as Matrix Market format
    print("\tConverting TF-IDF")
    self.tfidf_mm = gensim.corpora.MmCorpus(self.fname + '_tfidf.mm')
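
# Hypothetical follow-up outside the class above: reload the serialized
# TF-IDF corpus and query it with a similarity index (the file name mirrors
# the `fname + '_tfidf.mm'` convention used in _process_corpus).
import gensim

mm = gensim.corpora.MmCorpus('corpus_tfidf.mm')  # placeholder path
index = gensim.similarities.SparseMatrixSimilarity(mm, num_features=mm.num_terms)
first_doc = next(iter(mm))
sims = index[first_doc]  # similarity of the first document to all others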
def __init__(self, oblige_fit, path):
    super().__init__(oblige_fit, path)

    with open(path + 'tags_embs.pkl', 'rb') as file:
        self.embs = pickle.load(file)

    self.tp = TextProcessor(path)
    self.lda_dic = Dictionary.load(path + 'questions.lda_dic')
    self.lda_tfidf = TfidfModel.load(path + 'questions.lda_tfidf')
    self.lda_model = LdaMulticore.load(path + 'questions.lda_model')
    self.d2v = Doc2Vec.load(path + 'questions.d2v')

    self.features = {
        'categorical': [],
        'numerical': {
            'zero': ['questions_body_length', 'questions_tag_count'],
            'mean': []
        },
        'date': ['questions_date_added']
    }
    self._unroll_features()
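
# Hedged sketch of how the artifacts loaded in __init__ chain together at
# inference time; the names mirror the attributes set above, and the
# whitespace tokenization is a placeholder for whatever TextProcessor does.
tokens = "how do i prepare for a coding interview".split()
bow = lda_dic.doc2bow(tokens)
topic_vec = lda_model[lda_tfidf[bow]]  # [(topic_id, weight), ...]
doc_emb = d2v.infer_vector(tokens)     # Doc2Vec embedding of the same text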