How to use the gensim.models.TfidfModel function in gensim

To help you get started, we’ve selected a few gensim.models.TfidfModel examples based on popular ways it is used in public projects.

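The snippets below come from real repositories; as a quick orientation, this is the core pattern they all share, sketched here on a toy corpus (data and variable names are illustrative):

from gensim import corpora, models

# toy corpus: one tokenized document per list (illustrative data)
documents = [["human", "interface", "computer"],
             ["survey", "user", "computer", "system"],
             ["graph", "trees", "minors"]]

dictionary = corpora.Dictionary(documents)            # token -> integer id
bow_corpus = [dictionary.doc2bow(doc) for doc in documents]

tfidf = models.TfidfModel(bow_corpus)                 # fit document frequencies
for doc in tfidf[bow_corpus]:                         # lazily re-weight each document
    print(doc)                                        # [(token_id, tf-idf weight), ...]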

github ARTFL-Project / text-pair / lib / textpair / vector_similarity.py View on Github external
        text_object_level_split=config["text_object_level_split"],
        phrase_model=phrase_model,
    )

    target_corpus: CorpusVectorizer = CorpusVectorizer(
        target_texts,
        text_object_definition=config["text_object_definition"],
        dictionary=source_corpus.dictionary,
        min_text_obj_length=config["min_text_obj_length"],
        n_chunk=config["n_chunk"],
        text_object_level_split=config["text_object_level_split"],
        phrase_model=phrase_model,
    )
    source_corpus.dictionary = target_corpus.dictionary
    print("Vectorizing texts...", flush=True)
    model: TfidfModel = TfidfModel(chain(source_corpus.vectors, target_corpus.vectors), smartirs="atc")
    source_corpus.update_with_tfidf(model)
    target_corpus.update_with_tfidf(model)
    index: SparseMatrixSimilarity = SparseMatrixSimilarity(
        source_corpus, num_features=len(source_corpus.dictionary), num_docs=len(source_corpus)
    )
    results: np.ndarray = index[target_corpus]
    with tqdm(total=source_corpus.length, leave=False) as pbar:
        for source_pos, source_vector_results in enumerate(results.T):
            filtered_results: np.ndarray = np.where(source_vector_results > config["min_similarity"])[0]
            count += len(filtered_results)
            for target_pos in filtered_results:
                matches.append(
                    (
                        PASSAGE_GROUP(
                            source_corpus[source_pos],
                            source_corpus.metadata[source_pos]["start_byte"],
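
A note on the snippet above: smartirs="atc" selects a SMART weighting scheme — augmented term frequency ('a'), idf document weighting ('t'), and cosine normalization ('c'). A minimal sketch of the same call on a toy corpus (names are illustrative):

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

docs = [["cat", "dog"], ["dog", "fish"], ["cat", "cat", "bird"]]
dictionary = Dictionary(docs)
bow = [dictionary.doc2bow(d) for d in docs]

# 'a' = augmented tf, 't' = idf, 'c' = cosine normalization
model = TfidfModel(bow, smartirs="atc")
print(model[bow[0]])
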
github tesserae / tesserae / branches / semantics / gensim-whitaker / benchmark.py View on Github external
with open(path_data + "lookup.pickle", "rb") as f:  # binary mode for pickle data
    lookup = pickle.load(f)

# print "loading saved dictionary"

dictionary = corpora.Dictionary.load(path_data + "whitaker.dict")

# print "loading saved corpus"

corpus = corpora.MmCorpus(path_data + "whitaker.mm")

# print "loading saved tfidf model"

tfidf = models.TfidfModel.load(path_data + "model.whitaker.tfidf")

# print "creating tfidf wrapper for corpus"

corpus_tfidf = tfidf[corpus]

# print "loading saved lsi model"

lsi = models.LsiModel.load(path_data + "model.whitaker.lsi")

# print "creating lsi wrapper for corpus"

corpus_lsi = lsi[corpus_tfidf]

# print "loading similarities matrix"

index = similarities.MatrixSimilarity.load(path_data + "whitaker.lsi.index")
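
With the dictionary, models, and index loaded, querying follows the same chain of transformations; a sketch (the query text is illustrative, not part of the benchmark):

# a hypothetical Latin query, converted with the loaded dictionary
query_bow = dictionary.doc2bow("arma virumque cano".split())
query_lsi = lsi[tfidf[query_bow]]        # tf-idf weighting, then LSI projection

sims = index[query_lsi]                  # cosine similarity against every document
print(sorted(enumerate(sims), key=lambda pair: -pair[1])[:5])
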
github KBNLresearch / frame-generator / frame-generator / models.py View on Github external
def __init__(self, doc_reader):
        '''
        Set TfIdfList attributes.
        '''
        self.doc_reader = doc_reader

        print('Generating Gensim TF-IDF model ...')
        tfidf = gensim.models.TfidfModel(self.doc_reader.corpus)
        self.scores = tfidf[self.doc_reader.corpus]
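
Iterating self.scores yields one list of (token_id, weight) pairs per document. A short sketch of ranking each document's terms by weight, assuming doc_reader also exposes the Dictionary behind the corpus (that attribute is an assumption):

# assumption: doc_reader.dictionary is the gensim Dictionary behind the corpus
for doc_scores in self.scores:
    top = sorted(doc_scores, key=lambda pair: -pair[1])[:5]
    print([(self.doc_reader.dictionary[tid], round(w, 3)) for tid, w in top])
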
github bmilde / ambientsearch / python / training / build_wiki_corpus.py View on Github external
wiki.dictionary.filter_tokens(bad_ids=stop_ids)

    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
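
Everything serialized above can be streamed back from disk later without re-parsing the wiki dump; a sketch using the same outp naming:

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel

dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
tfidf = TfidfModel.load(outp + '.tfidf_model')
corpus_tfidf = MmCorpus(outp + '_tfidf.mm')   # pre-weighted vectors, streamed lazily
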
github dipanjanS / text-analytics-with-python / Old-First-Edition / source_code / Ch05_Text_Summarization / topic_modeling.py View on Github external
def train_lda_model_gensim(corpus, total_topics=2):
    
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) 
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]  # LDA is then trained on tf-idf weights rather than raw counts
    lda = models.LdaModel(corpus_tfidf, 
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics)
    return lda
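
Calling the function only needs raw documents, since tokenization and dictionary building happen inside (toy corpus illustrative; normalize_corpus is the book's own helper):

toy_corpus = ["The fox jumps over the lazy dog",
              "The dog barks at the fox",
              "Foxes and dogs are common animals"]

lda = train_lda_model_gensim(toy_corpus, total_topics=2)
print(lda.show_topics())    # one (topic_id, term-weight string) pair per topic
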
github 5hirish / adam_qas / qas / doc_scorer.py View on Github external
def transform_vec(corpus, query_corpus):
    tfidf = gensim.models.TfidfModel(corpus)

    corpus_tfidf = tfidf[corpus]
    query_tfidf = tfidf[query_corpus]
    return corpus_tfidf, query_tfidf
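
The two transformed streams are typically fed into a similarity index next; a sketch assuming the Dictionary that produced corpus is available as dictionary (an assumption, since transform_vec never sees it):

from gensim import similarities

corpus_tfidf, query_tfidf = transform_vec(corpus, query_corpus)
index = similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=len(dictionary))
for query_vec in query_tfidf:
    print(index[query_vec])   # similarity of this query against every document
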
github tesserae / tesserae / branches / semantics / gensim-whitaker / whitaker.models.py View on Github external
# convert each sample to a bag of words

print "converting each doc to bag-of-words"

corpus = [dictionary.doc2bow(doc) for doc in corpus]

corpora.MmCorpus.serialize(fs_data + '/whitaker.mm', corpus)

#
# this bit is copied from
# 	http://radimrehurek.com/gensim/tut2.html
#

print "creating tfidf model"

tfidf = models.TfidfModel(corpus)

tfidf.save(fs_data + "/model.whitaker.tfidf")

print "transforming the corpus to tfidf"

corpus_tfidf = tfidf[corpus]

print "creating lsi model"

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)

lsi.save(fs_data + "/model.whitaker.lsi")

print "transforming the corpus to lsi"

corpus_lsi = lsi[corpus_tfidf]
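
The benchmark script shown earlier loads whitaker.lsi.index; building and saving that index from corpus_lsi would look roughly like this (a sketch, not code from the repository):

from gensim import similarities

index = similarities.MatrixSimilarity(corpus_lsi, num_features=lsi.num_topics)
index.save(fs_data + "/whitaker.lsi.index")
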
github snorkel-team / snorkel / snorkel / contrib / embedding / lsa_embedding.py View on Github external
def _process_corpus(self):
		# Get MatrixMarket format corpus
		print("\tConverting corpus")
		gensim.corpora.MmCorpus.serialize(
			self.fname + '.mm', self.corpus, progress_cnt=100
		)
		mm_corpus = gensim.corpora.MmCorpus(self.fname + '.mm')
		# Get TF-IDF model
		print("\tComputing TF-IDF")
		tfidf = gensim.models.TfidfModel(
			mm_corpus, id2word=self.dictionary, normalize=True
		)
		gensim.corpora.MmCorpus.serialize(
			self.fname + '_tfidf.mm', tfidf[mm_corpus], progress_cnt=100
		)
		# Reload as Matrix Market format
		print("\tConverting TF-IDF")
		self.tfidf_mm = gensim.corpora.MmCorpus(self.fname + '_tfidf.mm')
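
Serializing and then reloading keeps the tf-idf vectors on disk, so only one document is in memory at a time. In an LSA embedding pipeline the reloaded matrix would typically feed an LsiModel next, roughly (a sketch of the likely next step, not the class's actual code):

lsi = gensim.models.LsiModel(
    self.tfidf_mm, id2word=self.dictionary, num_topics=300  # 300 is illustrative
)
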
github dataroot / Kaggle-CV / proj / processors.py View on Github external
def __init__(self, oblige_fit, path):
        super().__init__(oblige_fit, path)

        with open(path + 'tags_embs.pkl', 'rb') as file:
            self.embs = pickle.load(file)

        self.tp = TextProcessor(path)
        self.lda_dic = Dictionary.load(path + 'questions.lda_dic')
        self.lda_tfidf = TfidfModel.load(path + 'questions.lda_tfidf')
        self.lda_model = LdaMulticore.load(path + 'questions.lda_model')
        self.d2v = Doc2Vec.load(path + 'questions.d2v')

        self.features = {
            'categorical': [],
            'numerical': {
                'zero': ['questions_body_length', 'questions_tag_count'],
                'mean': []
            },
            'date': ['questions_date_added']
        }

        self._unroll_features()
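
At inference time the loaded pieces chain together to turn a question into topic features, roughly as follows (a sketch; the question text and the TextProcessor call are hypothetical):

tokens = self.tp.process("How do I prepare for a career in data science?")  # hypothetical API
bow = self.lda_dic.doc2bow(tokens)
topic_vec = self.lda_model.get_document_topics(self.lda_tfidf[bow])  # [(topic_id, prob), ...]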