How to use the gensim.corpora module in gensim

To help you get started, we’ve selected a few gensim.corpora examples, based on popular ways it is used in public projects.

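The snippets below revolve around the same few gensim.corpora building blocks: a Dictionary that maps tokens to integer ids, doc2bow to turn a token list into a bag-of-words vector, and MmCorpus to serialize those vectors. A minimal, self-contained sketch (the toy documents are made up for illustration):

from gensim import corpora

# toy tokenized documents, purely for illustration
texts = [["human", "computer", "interaction"],
         ["graph", "minors", "survey"],
         ["graph", "trees", "computer"]]

dictionary = corpora.Dictionary(texts)                   # token -> integer id
corpus = [dictionary.doc2bow(text) for text in texts]    # each doc as (id, count) pairs

dictionary.save("example.dict")                          # persist both artifacts
corpora.MmCorpus.serialize("example.mm", corpus)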

github kethort / TwitterLDATopicModeling / src / create_LDA_model.py
# use the built-in Gensim lemmatize engine 
    if lemma:
        return utils.lemmatize(text, stopwords=ignore_words, min_length=3)

    # tokenize words using NLTK Twitter Tokenizer
    tknzr = TweetTokenizer()
    text = tknzr.tokenize(text)

    # lowercase, remove words less than len 2 & remove numbers in tokenized list
    return [word.lower() for word in text if len(word) > 2 and not word.isdigit() and word not in ignore_words]

def filenames_to_generator(directory):
    for filename in os.listdir(directory):
        yield directory + str(filename)

class DocCorpus(gensim.corpora.TextCorpus):
    # overrides the get_texts function of Gensim TextCorpus in order to use 
    # directory of texts as corpus, where each text file is a document
    def __init__(self, docs_loc, lemmatize, dictionary=None, metadata=None):
        self.docs_loc = docs_loc
        self.lemmatize = lemmatize
        self.metadata = metadata
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
        else:
            self.dictionary = dictionary
    def get_texts(self):
        pool = multiprocessing.Pool(max(1, multiprocessing.cpu_count() - 1))
        func = partial(preprocess_text, self.lemmatize)
        for tokens in pool.map(func, filenames_to_generator(self.docs_loc)):
            yield tokens
        pool.terminate()
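A hedged sketch of how DocCorpus might be plugged into a topic model; the directory path and topic count below are placeholders, not part of the original project:

import gensim

# assumed inputs: a directory of plain-text files and an arbitrary topic count
doc_corpus = DocCorpus('documents/', lemmatize=False)
lda = gensim.models.LdaModel(doc_corpus, id2word=doc_corpus.dictionary, num_topics=10)
print(lda.print_topics())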
github adparker / GADSLA_1403 / src / lesson15 / gensim_demo.py
lame = ['from', 'subject', 'nntp', 'posting', 'host', 'organization', 'lines', 're', 'nntp-posting-host']
stoplist = nltk.corpus.stopwords.words('english') + lame + list(string.punctuation) + ['--', "''", '``', "'s", "n't", '...']

def is_good_word(word):
	return len(word) > 3 and word not in stoplist

# texts: each document turned into a list of its words.
texts = [[word for word in nltk.word_tokenize(document.lower()) if is_good_word(word)] for document in documents]

# remove words that appear only once
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once] for text in texts]

# Creating a dictionary to map words to integers.
dictionary = corpora.Dictionary(texts)

# print(list(dictionary.token2id.items())[:10])
# new_doc = "Human computer interaction"
# new_vec = dictionary.doc2bow(new_doc.lower().split())
# print(new_vec)

# Vectorize: turn each document (list-of-words) into a vectorized bag-of-words.
corpus = [dictionary.doc2bow(text) for text in texts]

# Transformations, from one vector representation to another.
# e.g. bag-of-words to TF-IDF
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model. This learns document frequencies.

# You can transform a vectorized bag-of-words:
# doc_bow = [(0, 1), (1, 1)]
# print(tfidf[doc_bow])
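Continuing with the corpus, dictionary, and tfidf objects defined above, the transformation can be applied to the whole corpus and chained into a further model; a sketch, where num_topics=2 is an arbitrary choice:

corpus_tfidf = tfidf[corpus]   # step 2 -- lazily transform every bag-of-words vector
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
for doc in lsi[corpus_tfidf]:  # each document as a 2-dimensional latent vector
    print(doc)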
github gugug / TensorFlow_DNN_Character_Classification / features / tfidf / tfidf_action.py
def prepare_lsi(self, doc):
        # used for the training set
        list_total, list_tag = self.load_data(doc)
        stop_word = self.load_stopword()
        texts = [[word for word in document.lower().split() if word not in stop_word]
                 for document in list_total]
        # train dictionary: extract a bag-of-words, mapping each document token to an integer id
        dictionary = corpora.Dictionary(texts)  # build the dictionary, e.g. {'a': 0, 'damaged': 1, 'gold': 3, 'fire': 2}
        # print dictionary.token2id
        # produce document vectors: convert each string document into a vector of (token id, term frequency) pairs
        corpus = [dictionary.doc2bow(text) for text in texts]
        # [[(0, 1), (6, 1)], [(0, 1), (9, 2), (10, 1)], [(0, 1), (3, 1)]]
        # e.g. the element (9, 2) means the word with id 9 appears twice in the second document
        # compute term weights with TF-IDF; sublinear_tf enables sublinear (1 + log) term-frequency scaling
        tfv = TfidfVectorizer(min_df=1, max_df=0.95, sublinear_tf=True, stop_words=stop_word)
        # compute TF-IDF over every word in all users' comments to get each word's tf-idf value
        X_sp = tfv.fit_transform(list_total)
        # train a TF-IDF model based on these "training documents"
        tfidf_model = models.TfidfModel(corpus)
        joblib.dump(tfidf_model, "tfidf_model.model")
        # transform the document vectors: re-express each term-frequency vector as a tf-idf weighted vector
        corpus_tfidf = tfidf_model[corpus]
        # [[(1, 0.6633689723434505), (2, 0.6633689723434505)],[(7, 0.16073253746956623), (8, 0.4355066251613605)]]
        # train the LSI model, i.e. SVD-decompose the matrix of training document vectors with a rank-2 approximation
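The method is cut off just before the LSI step its final comment describes; in gensim that step would typically look like the following sketch, continuing from the corpus_tfidf and dictionary variables above (num_topics=2 matches the rank-2 approximation mentioned in the comment):

lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lsi = lsi_model[corpus_tfidf]  # each document projected onto the 2 latent dimensions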
github dsindex / blog / transform.py
def construct_dictionary(documents_path, filter=None):
    # collect statistics about all tokens
    dictionary = corpora.Dictionary(line.lower().split() for line in open(documents_path))

    if filter:
        # remove stop words and words that appear only once
        stoplist = set('for a of the and to in'.split())
        stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
        once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
        dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
        dictionary.compactify() # remove gaps in id sequence after words that were removed

    return dictionary
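A hypothetical call, assuming a plain-text file with one whitespace-tokenized document per line (the file name is made up):

dictionary = construct_dictionary('documents.txt', filter=True)
new_vec = dictionary.doc2bow("human computer interaction".lower().split())  # unknown tokens are simply dropped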
github mideind / Greynir / vectors / builder.py
def load_plain_corpus(self):
        """ Load the plain corpus from file """
        return corpora.MmCorpus(self._PLAIN_CORPUS_FILE)
github practical-recommender-systems / moviegeek / builder / lda_model_calculator.py
def save_lda_model(self, lda_model, corpus, dictionary, index):

        index.save(self.lda_path + 'index.lda')
        pyLDAvis.save_json(pyLDAvis.gensim.prepare(lda_model, corpus, dictionary), self.lda_path + '/../static/js/lda.json')
        print(lda_model.print_topics())
        lda_model.save(self.lda_path + 'model.lda')

        dictionary.save(self.lda_path + 'dict.lda')
        corpora.MmCorpus.serialize(self.lda_path + 'corpus.mm', corpus)
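A hedged sketch of reloading the artifacts saved above; lda_path below is an assumed prefix mirroring self.lda_path:

from gensim import corpora, models

lda_path = './lda/'  # assumed prefix, standing in for self.lda_path above
lda_model = models.LdaModel.load(lda_path + 'model.lda')
dictionary = corpora.Dictionary.load(lda_path + 'dict.lda')
corpus = corpora.MmCorpus(lda_path + 'corpus.mm')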
github SmokinCaterpillar / TrufflePig / trufflepig / model.py
def fill_dictionary(self, tokens):
        """ Fills a dictionary

        Parameters
        ----------
        tokens: list of list of str
            e.g. [['hi', 'ho'], ['my', 'name', ...], ...]

        """
        self.dictionary = corpora.Dictionary(tokens, prune_at=self.prune_at)
        self.dictionary.filter_extremes(self.no_below, self.no_above,
                                        keep_n=self.keep_n)
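Usage would look roughly like this, with made-up tokens and model standing in for an instance of the surrounding class:

tokens = [['hi', 'ho'], ['my', 'name', 'is', 'hi']]  # illustrative data
model.fill_dictionary(tokens)                        # 'model' is an assumed instance
bow = model.dictionary.doc2bow(tokens[0])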
github nmslib / nmslib / data / data_conv / create_lda_multicore.py
if len(sys.argv) != 3:
  raise Exception("Usage:  ")

ntop=int(sys.argv[1])
ncores=int(sys.argv[2])
print "Using " + str(ntop) + " topics and " + str(ncores) + " cores"

lda = gensim.models.LdaMulticore(corpus=mmTrain, id2word=id2word, num_topics= ntop, workers=ncores)

lda_file = 'LDA/lda'+str(ntop)

lda.save(lda_file)

out_vect = 'LDA/wikipedia_lda'+str(ntop)+'.txt'
#gensim.corpora.MmCorpus.serialize(out_vect, (gensim.matutils.unitvec(vec) for vec in lda[mmTest]))
gensim.corpora.MmCorpus.serialize(out_vect, lda[mmTest])
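The snippet presumes mmTrain, mmTest, and id2word already exist; a sketch of how such objects are typically loaded, with placeholder file names (e.g. output of gensim's Wikipedia preprocessing scripts):

import gensim

id2word = gensim.corpora.Dictionary.load_from_text('wiki_wordids.txt')  # assumed path
mmTrain = gensim.corpora.MmCorpus('wiki_train_tfidf.mm')                # assumed path
mmTest = gensim.corpora.MmCorpus('wiki_test_tfidf.mm')                  # assumed path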
github chrisjmccormick / simsearch / keysearch.py
    @classmethod
    def load(cls, save_dir='./'):
        """
        Load the corpus from a save directory.
        """
        tables = pickle.load(open(save_dir + 'tag-tables.pickle', 'rb'))
        tagsToDocs = tables[0]
        docsToTags = tables[1]        
        titles = pickle.load(open(save_dir + 'titles.pickle', 'rb'))
        tfidf_model = TfidfModel.load(fname=save_dir + 'documents.tfidf_model')
        corpus_tfidf = corpora.MmCorpus(save_dir + 'documents_tfidf.mm')
        dictionary = corpora.Dictionary.load(fname=save_dir + 'documents.dict')
        files = pickle.load(open(save_dir + 'files.pickle', 'rb'))
        doc_line_nums = pickle.load(open(save_dir + 'doc_line_nums.pickle', 'rb'))
        
        ksearch = KeySearch(dictionary, tfidf_model, 
                            corpus_tfidf, titles, tagsToDocs,
                            docsToTags, files, doc_line_nums) 
        
        return ksearch
github macks22 / dblp / pipeline / repdocs.py
def run(self):
        repdocs = self.read_repdocs()
        dictionary = gensim.corpora.Dictionary(repdocs)
        logging.info('term count in paper repdoc corpus pre-filtering: %d' %
                     len(dictionary))
        dictionary.filter_extremes(2, 1, len(dictionary))  # no_below=2, no_above=1, keep_n=len(dictionary): drop tokens in fewer than 2 docs, keep everything else
        logging.info('term count in paper repdoc corpus post-filtering: %d' %
                     len(dictionary))
        dictionary.save(self.output().path)